Index: head/sys/dev/bxe/bxe.c =================================================================== --- head/sys/dev/bxe/bxe.c (revision 275357) +++ head/sys/dev/bxe/bxe.c (revision 275358) @@ -1,18811 +1,18810 @@ /*- * Copyright (c) 2007-2014 QLogic Corporation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define BXE_DRIVER_VERSION "1.78.78" #include "bxe.h" #include "ecore_sp.h" #include "ecore_init.h" #include "ecore_init_ops.h" #include "57710_int_offsets.h" #include "57711_int_offsets.h" #include "57712_int_offsets.h" /* * CTLTYPE_U64 and sysctl_handle_64 were added in r217616. Define these * explicitly here for older kernels that don't include this changeset. */ #ifndef CTLTYPE_U64 #define CTLTYPE_U64 CTLTYPE_QUAD #define sysctl_handle_64 sysctl_handle_quad #endif /* * CSUM_TCP_IPV6 and CSUM_UDP_IPV6 were added in r236170. Define these * here as zero(0) for older kernels that don't include this changeset * thereby masking the functionality. */ #ifndef CSUM_TCP_IPV6 #define CSUM_TCP_IPV6 0 #define CSUM_UDP_IPV6 0 #endif /* * pci_find_cap was added in r219865. Re-define this at pci_find_extcap * for older kernels that don't include this changeset. */ #if __FreeBSD_version < 900035 #define pci_find_cap pci_find_extcap #endif #define BXE_DEF_SB_ATT_IDX 0x0001 #define BXE_DEF_SB_IDX 0x0002 /* * FLR Support - bxe_pf_flr_clnup() is called during nic_load in the per * function HW initialization. */ #define FLR_WAIT_USEC 10000 /* 10 msecs */ #define FLR_WAIT_INTERVAL 50 /* usecs */ #define FLR_POLL_CNT (FLR_WAIT_USEC / FLR_WAIT_INTERVAL) /* 200 */ struct pbf_pN_buf_regs { int pN; uint32_t init_crd; uint32_t crd; uint32_t crd_freed; }; struct pbf_pN_cmd_regs { int pN; uint32_t lines_occup; uint32_t lines_freed; }; /* * PCI Device ID Table used by bxe_probe(). */ #define BXE_DEVDESC_MAX 64 static struct bxe_device_type bxe_devs[] = { { BRCM_VENDORID, CHIP_NUM_57710, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57710 10GbE" }, { BRCM_VENDORID, CHIP_NUM_57711, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57711 10GbE" }, { BRCM_VENDORID, CHIP_NUM_57711E, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57711E 10GbE" }, { BRCM_VENDORID, CHIP_NUM_57712, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57712 10GbE" }, { BRCM_VENDORID, CHIP_NUM_57712_MF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57712 MF 10GbE" }, #if 0 { BRCM_VENDORID, CHIP_NUM_57712_VF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57712 VF 10GbE" }, #endif { BRCM_VENDORID, CHIP_NUM_57800, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57800 10GbE" }, { BRCM_VENDORID, CHIP_NUM_57800_MF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57800 MF 10GbE" }, #if 0 { BRCM_VENDORID, CHIP_NUM_57800_VF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57800 VF 10GbE" }, #endif { BRCM_VENDORID, CHIP_NUM_57810, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57810 10GbE" }, { BRCM_VENDORID, CHIP_NUM_57810_MF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57810 MF 10GbE" }, #if 0 { BRCM_VENDORID, CHIP_NUM_57810_VF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57810 VF 10GbE" }, #endif { BRCM_VENDORID, CHIP_NUM_57811, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57811 10GbE" }, { BRCM_VENDORID, CHIP_NUM_57811_MF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57811 MF 10GbE" }, #if 0 { BRCM_VENDORID, CHIP_NUM_57811_VF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57811 VF 10GbE" }, #endif { BRCM_VENDORID, CHIP_NUM_57840_4_10, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57840 4x10GbE" }, #if 0 { BRCM_VENDORID, CHIP_NUM_57840_2_20, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57840 2x20GbE" }, #endif { BRCM_VENDORID, CHIP_NUM_57840_MF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57840 MF 10GbE" }, #if 0 { BRCM_VENDORID, CHIP_NUM_57840_VF, PCI_ANY_ID, PCI_ANY_ID, "QLogic NetXtreme II BCM57840 VF 10GbE" }, #endif { 0, 0, 0, 0, NULL } }; MALLOC_DECLARE(M_BXE_ILT); MALLOC_DEFINE(M_BXE_ILT, "bxe_ilt", "bxe ILT pointer"); /* * FreeBSD device entry points. */ static int bxe_probe(device_t); static int bxe_attach(device_t); static int bxe_detach(device_t); static int bxe_shutdown(device_t); /* * FreeBSD KLD module/device interface event handler method. */ static device_method_t bxe_methods[] = { /* Device interface (device_if.h) */ DEVMETHOD(device_probe, bxe_probe), DEVMETHOD(device_attach, bxe_attach), DEVMETHOD(device_detach, bxe_detach), DEVMETHOD(device_shutdown, bxe_shutdown), #if 0 DEVMETHOD(device_suspend, bxe_suspend), DEVMETHOD(device_resume, bxe_resume), #endif /* Bus interface (bus_if.h) */ DEVMETHOD(bus_print_child, bus_generic_print_child), DEVMETHOD(bus_driver_added, bus_generic_driver_added), KOBJMETHOD_END }; /* * FreeBSD KLD Module data declaration */ static driver_t bxe_driver = { "bxe", /* module name */ bxe_methods, /* event handler */ sizeof(struct bxe_softc) /* extra data */ }; /* * FreeBSD dev class is needed to manage dev instances and * to associate with a bus type */ static devclass_t bxe_devclass; MODULE_DEPEND(bxe, pci, 1, 1, 1); MODULE_DEPEND(bxe, ether, 1, 1, 1); DRIVER_MODULE(bxe, pci, bxe_driver, bxe_devclass, 0, 0); /* resources needed for unloading a previously loaded device */ #define BXE_PREV_WAIT_NEEDED 1 struct mtx bxe_prev_mtx; MTX_SYSINIT(bxe_prev_mtx, &bxe_prev_mtx, "bxe_prev_lock", MTX_DEF); struct bxe_prev_list_node { LIST_ENTRY(bxe_prev_list_node) node; uint8_t bus; uint8_t slot; uint8_t path; uint8_t aer; /* XXX automatic error recovery */ uint8_t undi; }; static LIST_HEAD(, bxe_prev_list_node) bxe_prev_list = LIST_HEAD_INITIALIZER(bxe_prev_list); static int load_count[2][3] = { {0} }; /* per-path: 0-common, 1-port0, 2-port1 */ /* Tunable device values... */ SYSCTL_NODE(_hw, OID_AUTO, bxe, CTLFLAG_RD, 0, "bxe driver parameters"); /* Debug */ unsigned long bxe_debug = 0; SYSCTL_ULONG(_hw_bxe, OID_AUTO, debug, CTLFLAG_RDTUN, &bxe_debug, 0, "Debug logging mode"); /* Interrupt Mode: 0 (IRQ), 1 (MSI/IRQ), and 2 (MSI-X/MSI/IRQ) */ static int bxe_interrupt_mode = INTR_MODE_MSIX; SYSCTL_INT(_hw_bxe, OID_AUTO, interrupt_mode, CTLFLAG_RDTUN, &bxe_interrupt_mode, 0, "Interrupt (MSI-X/MSI/INTx) mode"); /* Number of Queues: 0 (Auto) or 1 to 16 (fixed queue number) */ static int bxe_queue_count = 4; SYSCTL_INT(_hw_bxe, OID_AUTO, queue_count, CTLFLAG_RDTUN, &bxe_queue_count, 0, "Multi-Queue queue count"); /* max number of buffers per queue (default RX_BD_USABLE) */ static int bxe_max_rx_bufs = 0; SYSCTL_INT(_hw_bxe, OID_AUTO, max_rx_bufs, CTLFLAG_RDTUN, &bxe_max_rx_bufs, 0, "Maximum Number of Rx Buffers Per Queue"); /* Host interrupt coalescing RX tick timer (usecs) */ static int bxe_hc_rx_ticks = 25; SYSCTL_INT(_hw_bxe, OID_AUTO, hc_rx_ticks, CTLFLAG_RDTUN, &bxe_hc_rx_ticks, 0, "Host Coalescing Rx ticks"); /* Host interrupt coalescing TX tick timer (usecs) */ static int bxe_hc_tx_ticks = 50; SYSCTL_INT(_hw_bxe, OID_AUTO, hc_tx_ticks, CTLFLAG_RDTUN, &bxe_hc_tx_ticks, 0, "Host Coalescing Tx ticks"); /* Maximum number of Rx packets to process at a time */ static int bxe_rx_budget = 0xffffffff; SYSCTL_INT(_hw_bxe, OID_AUTO, rx_budget, CTLFLAG_TUN, &bxe_rx_budget, 0, "Rx processing budget"); /* Maximum LRO aggregation size */ static int bxe_max_aggregation_size = 0; SYSCTL_INT(_hw_bxe, OID_AUTO, max_aggregation_size, CTLFLAG_TUN, &bxe_max_aggregation_size, 0, "max aggregation size"); /* PCI MRRS: -1 (Auto), 0 (128B), 1 (256B), 2 (512B), 3 (1KB) */ static int bxe_mrrs = -1; SYSCTL_INT(_hw_bxe, OID_AUTO, mrrs, CTLFLAG_RDTUN, &bxe_mrrs, 0, "PCIe maximum read request size"); /* AutoGrEEEn: 0 (hardware default), 1 (force on), 2 (force off) */ static int bxe_autogreeen = 0; SYSCTL_INT(_hw_bxe, OID_AUTO, autogreeen, CTLFLAG_RDTUN, &bxe_autogreeen, 0, "AutoGrEEEn support"); /* 4-tuple RSS support for UDP: 0 (disabled), 1 (enabled) */ static int bxe_udp_rss = 0; SYSCTL_INT(_hw_bxe, OID_AUTO, udp_rss, CTLFLAG_RDTUN, &bxe_udp_rss, 0, "UDP RSS support"); #define STAT_NAME_LEN 32 /* no stat names below can be longer than this */ #define STATS_OFFSET32(stat_name) \ (offsetof(struct bxe_eth_stats, stat_name) / 4) #define Q_STATS_OFFSET32(stat_name) \ (offsetof(struct bxe_eth_q_stats, stat_name) / 4) static const struct { uint32_t offset; uint32_t size; uint32_t flags; #define STATS_FLAGS_PORT 1 #define STATS_FLAGS_FUNC 2 /* MF only cares about function stats */ #define STATS_FLAGS_BOTH (STATS_FLAGS_FUNC | STATS_FLAGS_PORT) char string[STAT_NAME_LEN]; } bxe_eth_stats_arr[] = { { STATS_OFFSET32(total_bytes_received_hi), 8, STATS_FLAGS_BOTH, "rx_bytes" }, { STATS_OFFSET32(error_bytes_received_hi), 8, STATS_FLAGS_BOTH, "rx_error_bytes" }, { STATS_OFFSET32(total_unicast_packets_received_hi), 8, STATS_FLAGS_BOTH, "rx_ucast_packets" }, { STATS_OFFSET32(total_multicast_packets_received_hi), 8, STATS_FLAGS_BOTH, "rx_mcast_packets" }, { STATS_OFFSET32(total_broadcast_packets_received_hi), 8, STATS_FLAGS_BOTH, "rx_bcast_packets" }, { STATS_OFFSET32(rx_stat_dot3statsfcserrors_hi), 8, STATS_FLAGS_PORT, "rx_crc_errors" }, { STATS_OFFSET32(rx_stat_dot3statsalignmenterrors_hi), 8, STATS_FLAGS_PORT, "rx_align_errors" }, { STATS_OFFSET32(rx_stat_etherstatsundersizepkts_hi), 8, STATS_FLAGS_PORT, "rx_undersize_packets" }, { STATS_OFFSET32(etherstatsoverrsizepkts_hi), 8, STATS_FLAGS_PORT, "rx_oversize_packets" }, { STATS_OFFSET32(rx_stat_etherstatsfragments_hi), 8, STATS_FLAGS_PORT, "rx_fragments" }, { STATS_OFFSET32(rx_stat_etherstatsjabbers_hi), 8, STATS_FLAGS_PORT, "rx_jabbers" }, { STATS_OFFSET32(no_buff_discard_hi), 8, STATS_FLAGS_BOTH, "rx_discards" }, { STATS_OFFSET32(mac_filter_discard), 4, STATS_FLAGS_PORT, "rx_filtered_packets" }, { STATS_OFFSET32(mf_tag_discard), 4, STATS_FLAGS_PORT, "rx_mf_tag_discard" }, { STATS_OFFSET32(pfc_frames_received_hi), 8, STATS_FLAGS_PORT, "pfc_frames_received" }, { STATS_OFFSET32(pfc_frames_sent_hi), 8, STATS_FLAGS_PORT, "pfc_frames_sent" }, { STATS_OFFSET32(brb_drop_hi), 8, STATS_FLAGS_PORT, "rx_brb_discard" }, { STATS_OFFSET32(brb_truncate_hi), 8, STATS_FLAGS_PORT, "rx_brb_truncate" }, { STATS_OFFSET32(pause_frames_received_hi), 8, STATS_FLAGS_PORT, "rx_pause_frames" }, { STATS_OFFSET32(rx_stat_maccontrolframesreceived_hi), 8, STATS_FLAGS_PORT, "rx_mac_ctrl_frames" }, { STATS_OFFSET32(nig_timer_max), 4, STATS_FLAGS_PORT, "rx_constant_pause_events" }, { STATS_OFFSET32(total_bytes_transmitted_hi), 8, STATS_FLAGS_BOTH, "tx_bytes" }, { STATS_OFFSET32(tx_stat_ifhcoutbadoctets_hi), 8, STATS_FLAGS_PORT, "tx_error_bytes" }, { STATS_OFFSET32(total_unicast_packets_transmitted_hi), 8, STATS_FLAGS_BOTH, "tx_ucast_packets" }, { STATS_OFFSET32(total_multicast_packets_transmitted_hi), 8, STATS_FLAGS_BOTH, "tx_mcast_packets" }, { STATS_OFFSET32(total_broadcast_packets_transmitted_hi), 8, STATS_FLAGS_BOTH, "tx_bcast_packets" }, { STATS_OFFSET32(tx_stat_dot3statsinternalmactransmiterrors_hi), 8, STATS_FLAGS_PORT, "tx_mac_errors" }, { STATS_OFFSET32(rx_stat_dot3statscarriersenseerrors_hi), 8, STATS_FLAGS_PORT, "tx_carrier_errors" }, { STATS_OFFSET32(tx_stat_dot3statssinglecollisionframes_hi), 8, STATS_FLAGS_PORT, "tx_single_collisions" }, { STATS_OFFSET32(tx_stat_dot3statsmultiplecollisionframes_hi), 8, STATS_FLAGS_PORT, "tx_multi_collisions" }, { STATS_OFFSET32(tx_stat_dot3statsdeferredtransmissions_hi), 8, STATS_FLAGS_PORT, "tx_deferred" }, { STATS_OFFSET32(tx_stat_dot3statsexcessivecollisions_hi), 8, STATS_FLAGS_PORT, "tx_excess_collisions" }, { STATS_OFFSET32(tx_stat_dot3statslatecollisions_hi), 8, STATS_FLAGS_PORT, "tx_late_collisions" }, { STATS_OFFSET32(tx_stat_etherstatscollisions_hi), 8, STATS_FLAGS_PORT, "tx_total_collisions" }, { STATS_OFFSET32(tx_stat_etherstatspkts64octets_hi), 8, STATS_FLAGS_PORT, "tx_64_byte_packets" }, { STATS_OFFSET32(tx_stat_etherstatspkts65octetsto127octets_hi), 8, STATS_FLAGS_PORT, "tx_65_to_127_byte_packets" }, { STATS_OFFSET32(tx_stat_etherstatspkts128octetsto255octets_hi), 8, STATS_FLAGS_PORT, "tx_128_to_255_byte_packets" }, { STATS_OFFSET32(tx_stat_etherstatspkts256octetsto511octets_hi), 8, STATS_FLAGS_PORT, "tx_256_to_511_byte_packets" }, { STATS_OFFSET32(tx_stat_etherstatspkts512octetsto1023octets_hi), 8, STATS_FLAGS_PORT, "tx_512_to_1023_byte_packets" }, { STATS_OFFSET32(etherstatspkts1024octetsto1522octets_hi), 8, STATS_FLAGS_PORT, "tx_1024_to_1522_byte_packets" }, { STATS_OFFSET32(etherstatspktsover1522octets_hi), 8, STATS_FLAGS_PORT, "tx_1523_to_9022_byte_packets" }, { STATS_OFFSET32(pause_frames_sent_hi), 8, STATS_FLAGS_PORT, "tx_pause_frames" }, { STATS_OFFSET32(total_tpa_aggregations_hi), 8, STATS_FLAGS_FUNC, "tpa_aggregations" }, { STATS_OFFSET32(total_tpa_aggregated_frames_hi), 8, STATS_FLAGS_FUNC, "tpa_aggregated_frames"}, { STATS_OFFSET32(total_tpa_bytes_hi), 8, STATS_FLAGS_FUNC, "tpa_bytes"}, #if 0 { STATS_OFFSET32(recoverable_error), 4, STATS_FLAGS_FUNC, "recoverable_errors" }, { STATS_OFFSET32(unrecoverable_error), 4, STATS_FLAGS_FUNC, "unrecoverable_errors" }, #endif { STATS_OFFSET32(eee_tx_lpi), 4, STATS_FLAGS_PORT, "eee_tx_lpi"}, { STATS_OFFSET32(rx_calls), 4, STATS_FLAGS_FUNC, "rx_calls"}, { STATS_OFFSET32(rx_pkts), 4, STATS_FLAGS_FUNC, "rx_pkts"}, { STATS_OFFSET32(rx_tpa_pkts), 4, STATS_FLAGS_FUNC, "rx_tpa_pkts"}, { STATS_OFFSET32(rx_soft_errors), 4, STATS_FLAGS_FUNC, "rx_soft_errors"}, { STATS_OFFSET32(rx_hw_csum_errors), 4, STATS_FLAGS_FUNC, "rx_hw_csum_errors"}, { STATS_OFFSET32(rx_ofld_frames_csum_ip), 4, STATS_FLAGS_FUNC, "rx_ofld_frames_csum_ip"}, { STATS_OFFSET32(rx_ofld_frames_csum_tcp_udp), 4, STATS_FLAGS_FUNC, "rx_ofld_frames_csum_tcp_udp"}, { STATS_OFFSET32(rx_budget_reached), 4, STATS_FLAGS_FUNC, "rx_budget_reached"}, { STATS_OFFSET32(tx_pkts), 4, STATS_FLAGS_FUNC, "tx_pkts"}, { STATS_OFFSET32(tx_soft_errors), 4, STATS_FLAGS_FUNC, "tx_soft_errors"}, { STATS_OFFSET32(tx_ofld_frames_csum_ip), 4, STATS_FLAGS_FUNC, "tx_ofld_frames_csum_ip"}, { STATS_OFFSET32(tx_ofld_frames_csum_tcp), 4, STATS_FLAGS_FUNC, "tx_ofld_frames_csum_tcp"}, { STATS_OFFSET32(tx_ofld_frames_csum_udp), 4, STATS_FLAGS_FUNC, "tx_ofld_frames_csum_udp"}, { STATS_OFFSET32(tx_ofld_frames_lso), 4, STATS_FLAGS_FUNC, "tx_ofld_frames_lso"}, { STATS_OFFSET32(tx_ofld_frames_lso_hdr_splits), 4, STATS_FLAGS_FUNC, "tx_ofld_frames_lso_hdr_splits"}, { STATS_OFFSET32(tx_encap_failures), 4, STATS_FLAGS_FUNC, "tx_encap_failures"}, { STATS_OFFSET32(tx_hw_queue_full), 4, STATS_FLAGS_FUNC, "tx_hw_queue_full"}, { STATS_OFFSET32(tx_hw_max_queue_depth), 4, STATS_FLAGS_FUNC, "tx_hw_max_queue_depth"}, { STATS_OFFSET32(tx_dma_mapping_failure), 4, STATS_FLAGS_FUNC, "tx_dma_mapping_failure"}, { STATS_OFFSET32(tx_max_drbr_queue_depth), 4, STATS_FLAGS_FUNC, "tx_max_drbr_queue_depth"}, { STATS_OFFSET32(tx_window_violation_std), 4, STATS_FLAGS_FUNC, "tx_window_violation_std"}, { STATS_OFFSET32(tx_window_violation_tso), 4, STATS_FLAGS_FUNC, "tx_window_violation_tso"}, #if 0 { STATS_OFFSET32(tx_unsupported_tso_request_ipv6), 4, STATS_FLAGS_FUNC, "tx_unsupported_tso_request_ipv6"}, { STATS_OFFSET32(tx_unsupported_tso_request_not_tcp), 4, STATS_FLAGS_FUNC, "tx_unsupported_tso_request_not_tcp"}, #endif { STATS_OFFSET32(tx_chain_lost_mbuf), 4, STATS_FLAGS_FUNC, "tx_chain_lost_mbuf"}, { STATS_OFFSET32(tx_frames_deferred), 4, STATS_FLAGS_FUNC, "tx_frames_deferred"}, { STATS_OFFSET32(tx_queue_xoff), 4, STATS_FLAGS_FUNC, "tx_queue_xoff"}, { STATS_OFFSET32(mbuf_defrag_attempts), 4, STATS_FLAGS_FUNC, "mbuf_defrag_attempts"}, { STATS_OFFSET32(mbuf_defrag_failures), 4, STATS_FLAGS_FUNC, "mbuf_defrag_failures"}, { STATS_OFFSET32(mbuf_rx_bd_alloc_failed), 4, STATS_FLAGS_FUNC, "mbuf_rx_bd_alloc_failed"}, { STATS_OFFSET32(mbuf_rx_bd_mapping_failed), 4, STATS_FLAGS_FUNC, "mbuf_rx_bd_mapping_failed"}, { STATS_OFFSET32(mbuf_rx_tpa_alloc_failed), 4, STATS_FLAGS_FUNC, "mbuf_rx_tpa_alloc_failed"}, { STATS_OFFSET32(mbuf_rx_tpa_mapping_failed), 4, STATS_FLAGS_FUNC, "mbuf_rx_tpa_mapping_failed"}, { STATS_OFFSET32(mbuf_rx_sge_alloc_failed), 4, STATS_FLAGS_FUNC, "mbuf_rx_sge_alloc_failed"}, { STATS_OFFSET32(mbuf_rx_sge_mapping_failed), 4, STATS_FLAGS_FUNC, "mbuf_rx_sge_mapping_failed"}, { STATS_OFFSET32(mbuf_alloc_tx), 4, STATS_FLAGS_FUNC, "mbuf_alloc_tx"}, { STATS_OFFSET32(mbuf_alloc_rx), 4, STATS_FLAGS_FUNC, "mbuf_alloc_rx"}, { STATS_OFFSET32(mbuf_alloc_sge), 4, STATS_FLAGS_FUNC, "mbuf_alloc_sge"}, { STATS_OFFSET32(mbuf_alloc_tpa), 4, STATS_FLAGS_FUNC, "mbuf_alloc_tpa"} }; static const struct { uint32_t offset; uint32_t size; char string[STAT_NAME_LEN]; } bxe_eth_q_stats_arr[] = { { Q_STATS_OFFSET32(total_bytes_received_hi), 8, "rx_bytes" }, { Q_STATS_OFFSET32(total_unicast_packets_received_hi), 8, "rx_ucast_packets" }, { Q_STATS_OFFSET32(total_multicast_packets_received_hi), 8, "rx_mcast_packets" }, { Q_STATS_OFFSET32(total_broadcast_packets_received_hi), 8, "rx_bcast_packets" }, { Q_STATS_OFFSET32(no_buff_discard_hi), 8, "rx_discards" }, { Q_STATS_OFFSET32(total_bytes_transmitted_hi), 8, "tx_bytes" }, { Q_STATS_OFFSET32(total_unicast_packets_transmitted_hi), 8, "tx_ucast_packets" }, { Q_STATS_OFFSET32(total_multicast_packets_transmitted_hi), 8, "tx_mcast_packets" }, { Q_STATS_OFFSET32(total_broadcast_packets_transmitted_hi), 8, "tx_bcast_packets" }, { Q_STATS_OFFSET32(total_tpa_aggregations_hi), 8, "tpa_aggregations" }, { Q_STATS_OFFSET32(total_tpa_aggregated_frames_hi), 8, "tpa_aggregated_frames"}, { Q_STATS_OFFSET32(total_tpa_bytes_hi), 8, "tpa_bytes"}, { Q_STATS_OFFSET32(rx_calls), 4, "rx_calls"}, { Q_STATS_OFFSET32(rx_pkts), 4, "rx_pkts"}, { Q_STATS_OFFSET32(rx_tpa_pkts), 4, "rx_tpa_pkts"}, { Q_STATS_OFFSET32(rx_soft_errors), 4, "rx_soft_errors"}, { Q_STATS_OFFSET32(rx_hw_csum_errors), 4, "rx_hw_csum_errors"}, { Q_STATS_OFFSET32(rx_ofld_frames_csum_ip), 4, "rx_ofld_frames_csum_ip"}, { Q_STATS_OFFSET32(rx_ofld_frames_csum_tcp_udp), 4, "rx_ofld_frames_csum_tcp_udp"}, { Q_STATS_OFFSET32(rx_budget_reached), 4, "rx_budget_reached"}, { Q_STATS_OFFSET32(tx_pkts), 4, "tx_pkts"}, { Q_STATS_OFFSET32(tx_soft_errors), 4, "tx_soft_errors"}, { Q_STATS_OFFSET32(tx_ofld_frames_csum_ip), 4, "tx_ofld_frames_csum_ip"}, { Q_STATS_OFFSET32(tx_ofld_frames_csum_tcp), 4, "tx_ofld_frames_csum_tcp"}, { Q_STATS_OFFSET32(tx_ofld_frames_csum_udp), 4, "tx_ofld_frames_csum_udp"}, { Q_STATS_OFFSET32(tx_ofld_frames_lso), 4, "tx_ofld_frames_lso"}, { Q_STATS_OFFSET32(tx_ofld_frames_lso_hdr_splits), 4, "tx_ofld_frames_lso_hdr_splits"}, { Q_STATS_OFFSET32(tx_encap_failures), 4, "tx_encap_failures"}, { Q_STATS_OFFSET32(tx_hw_queue_full), 4, "tx_hw_queue_full"}, { Q_STATS_OFFSET32(tx_hw_max_queue_depth), 4, "tx_hw_max_queue_depth"}, { Q_STATS_OFFSET32(tx_dma_mapping_failure), 4, "tx_dma_mapping_failure"}, { Q_STATS_OFFSET32(tx_max_drbr_queue_depth), 4, "tx_max_drbr_queue_depth"}, { Q_STATS_OFFSET32(tx_window_violation_std), 4, "tx_window_violation_std"}, { Q_STATS_OFFSET32(tx_window_violation_tso), 4, "tx_window_violation_tso"}, #if 0 { Q_STATS_OFFSET32(tx_unsupported_tso_request_ipv6), 4, "tx_unsupported_tso_request_ipv6"}, { Q_STATS_OFFSET32(tx_unsupported_tso_request_not_tcp), 4, "tx_unsupported_tso_request_not_tcp"}, #endif { Q_STATS_OFFSET32(tx_chain_lost_mbuf), 4, "tx_chain_lost_mbuf"}, { Q_STATS_OFFSET32(tx_frames_deferred), 4, "tx_frames_deferred"}, { Q_STATS_OFFSET32(tx_queue_xoff), 4, "tx_queue_xoff"}, { Q_STATS_OFFSET32(mbuf_defrag_attempts), 4, "mbuf_defrag_attempts"}, { Q_STATS_OFFSET32(mbuf_defrag_failures), 4, "mbuf_defrag_failures"}, { Q_STATS_OFFSET32(mbuf_rx_bd_alloc_failed), 4, "mbuf_rx_bd_alloc_failed"}, { Q_STATS_OFFSET32(mbuf_rx_bd_mapping_failed), 4, "mbuf_rx_bd_mapping_failed"}, { Q_STATS_OFFSET32(mbuf_rx_tpa_alloc_failed), 4, "mbuf_rx_tpa_alloc_failed"}, { Q_STATS_OFFSET32(mbuf_rx_tpa_mapping_failed), 4, "mbuf_rx_tpa_mapping_failed"}, { Q_STATS_OFFSET32(mbuf_rx_sge_alloc_failed), 4, "mbuf_rx_sge_alloc_failed"}, { Q_STATS_OFFSET32(mbuf_rx_sge_mapping_failed), 4, "mbuf_rx_sge_mapping_failed"}, { Q_STATS_OFFSET32(mbuf_alloc_tx), 4, "mbuf_alloc_tx"}, { Q_STATS_OFFSET32(mbuf_alloc_rx), 4, "mbuf_alloc_rx"}, { Q_STATS_OFFSET32(mbuf_alloc_sge), 4, "mbuf_alloc_sge"}, { Q_STATS_OFFSET32(mbuf_alloc_tpa), 4, "mbuf_alloc_tpa"} }; #define BXE_NUM_ETH_STATS ARRAY_SIZE(bxe_eth_stats_arr) #define BXE_NUM_ETH_Q_STATS ARRAY_SIZE(bxe_eth_q_stats_arr) static void bxe_cmng_fns_init(struct bxe_softc *sc, uint8_t read_cfg, uint8_t cmng_type); static int bxe_get_cmng_fns_mode(struct bxe_softc *sc); static void storm_memset_cmng(struct bxe_softc *sc, struct cmng_init *cmng, uint8_t port); static void bxe_set_reset_global(struct bxe_softc *sc); static void bxe_set_reset_in_progress(struct bxe_softc *sc); static uint8_t bxe_reset_is_done(struct bxe_softc *sc, int engine); static uint8_t bxe_clear_pf_load(struct bxe_softc *sc); static uint8_t bxe_chk_parity_attn(struct bxe_softc *sc, uint8_t *global, uint8_t print); static void bxe_int_disable(struct bxe_softc *sc); static int bxe_release_leader_lock(struct bxe_softc *sc); static void bxe_pf_disable(struct bxe_softc *sc); static void bxe_free_fp_buffers(struct bxe_softc *sc); static inline void bxe_update_rx_prod(struct bxe_softc *sc, struct bxe_fastpath *fp, uint16_t rx_bd_prod, uint16_t rx_cq_prod, uint16_t rx_sge_prod); static void bxe_link_report_locked(struct bxe_softc *sc); static void bxe_link_report(struct bxe_softc *sc); static void bxe_link_status_update(struct bxe_softc *sc); static void bxe_periodic_callout_func(void *xsc); static void bxe_periodic_start(struct bxe_softc *sc); static void bxe_periodic_stop(struct bxe_softc *sc); static int bxe_alloc_rx_bd_mbuf(struct bxe_fastpath *fp, uint16_t prev_index, uint16_t index); static int bxe_alloc_rx_tpa_mbuf(struct bxe_fastpath *fp, int queue); static int bxe_alloc_rx_sge_mbuf(struct bxe_fastpath *fp, uint16_t index); static uint8_t bxe_txeof(struct bxe_softc *sc, struct bxe_fastpath *fp); static void bxe_task_fp(struct bxe_fastpath *fp); static __noinline void bxe_dump_mbuf(struct bxe_softc *sc, struct mbuf *m, uint8_t contents); static int bxe_alloc_mem(struct bxe_softc *sc); static void bxe_free_mem(struct bxe_softc *sc); static int bxe_alloc_fw_stats_mem(struct bxe_softc *sc); static void bxe_free_fw_stats_mem(struct bxe_softc *sc); static int bxe_interrupt_attach(struct bxe_softc *sc); static void bxe_interrupt_detach(struct bxe_softc *sc); static void bxe_set_rx_mode(struct bxe_softc *sc); static int bxe_init_locked(struct bxe_softc *sc); static int bxe_stop_locked(struct bxe_softc *sc); static __noinline int bxe_nic_load(struct bxe_softc *sc, int load_mode); static __noinline int bxe_nic_unload(struct bxe_softc *sc, uint32_t unload_mode, uint8_t keep_link); static void bxe_handle_sp_tq(void *context, int pending); static void bxe_handle_rx_mode_tq(void *context, int pending); static void bxe_handle_fp_tq(void *context, int pending); /* calculate crc32 on a buffer (NOTE: crc32_length MUST be aligned to 8) */ uint32_t calc_crc32(uint8_t *crc32_packet, uint32_t crc32_length, uint32_t crc32_seed, uint8_t complement) { uint32_t byte = 0; uint32_t bit = 0; uint8_t msb = 0; uint32_t temp = 0; uint32_t shft = 0; uint8_t current_byte = 0; uint32_t crc32_result = crc32_seed; const uint32_t CRC32_POLY = 0x1edc6f41; if ((crc32_packet == NULL) || (crc32_length == 0) || ((crc32_length % 8) != 0)) { return (crc32_result); } for (byte = 0; byte < crc32_length; byte = byte + 1) { current_byte = crc32_packet[byte]; for (bit = 0; bit < 8; bit = bit + 1) { /* msb = crc32_result[31]; */ msb = (uint8_t)(crc32_result >> 31); crc32_result = crc32_result << 1; /* it (msb != current_byte[bit]) */ if (msb != (0x1 & (current_byte >> bit))) { crc32_result = crc32_result ^ CRC32_POLY; /* crc32_result[0] = 1 */ crc32_result |= 1; } } } /* Last step is to: * 1. "mirror" every bit * 2. swap the 4 bytes * 3. complement each bit */ /* Mirror */ temp = crc32_result; shft = sizeof(crc32_result) * 8 - 1; for (crc32_result >>= 1; crc32_result; crc32_result >>= 1) { temp <<= 1; temp |= crc32_result & 1; shft-- ; } /* temp[31-bit] = crc32_result[bit] */ temp <<= shft; /* Swap */ /* crc32_result = {temp[7:0], temp[15:8], temp[23:16], temp[31:24]} */ { uint32_t t0, t1, t2, t3; t0 = (0x000000ff & (temp >> 24)); t1 = (0x0000ff00 & (temp >> 8)); t2 = (0x00ff0000 & (temp << 8)); t3 = (0xff000000 & (temp << 24)); crc32_result = t0 | t1 | t2 | t3; } /* Complement */ if (complement) { crc32_result = ~crc32_result; } return (crc32_result); } int bxe_test_bit(int nr, volatile unsigned long *addr) { return ((atomic_load_acq_long(addr) & (1 << nr)) != 0); } void bxe_set_bit(unsigned int nr, volatile unsigned long *addr) { atomic_set_acq_long(addr, (1 << nr)); } void bxe_clear_bit(int nr, volatile unsigned long *addr) { atomic_clear_acq_long(addr, (1 << nr)); } int bxe_test_and_set_bit(int nr, volatile unsigned long *addr) { unsigned long x; nr = (1 << nr); do { x = *addr; } while (atomic_cmpset_acq_long(addr, x, x | nr) == 0); // if (x & nr) bit_was_set; else bit_was_not_set; return (x & nr); } int bxe_test_and_clear_bit(int nr, volatile unsigned long *addr) { unsigned long x; nr = (1 << nr); do { x = *addr; } while (atomic_cmpset_acq_long(addr, x, x & ~nr) == 0); // if (x & nr) bit_was_set; else bit_was_not_set; return (x & nr); } int bxe_cmpxchg(volatile int *addr, int old, int new) { int x; do { x = *addr; } while (atomic_cmpset_acq_int(addr, old, new) == 0); return (x); } /* * Get DMA memory from the OS. * * Validates that the OS has provided DMA buffers in response to a * bus_dmamap_load call and saves the physical address of those buffers. * When the callback is used the OS will return 0 for the mapping function * (bus_dmamap_load) so we use the value of map_arg->maxsegs to pass any * failures back to the caller. * * Returns: * Nothing. */ static void bxe_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct bxe_dma *dma = arg; if (error) { dma->paddr = 0; dma->nseg = 0; BLOGE(dma->sc, "Failed DMA alloc '%s' (%d)!\n", dma->msg, error); } else { dma->paddr = segs->ds_addr; dma->nseg = nseg; #if 0 BLOGD(dma->sc, DBG_LOAD, "DMA alloc '%s': vaddr=%p paddr=%p nseg=%d size=%lu\n", dma->msg, dma->vaddr, (void *)dma->paddr, dma->nseg, dma->size); #endif } } /* * Allocate a block of memory and map it for DMA. No partial completions * allowed and release any resources acquired if we can't acquire all * resources. * * Returns: * 0 = Success, !0 = Failure */ int bxe_dma_alloc(struct bxe_softc *sc, bus_size_t size, struct bxe_dma *dma, const char *msg) { int rc; if (dma->size > 0) { BLOGE(sc, "dma block '%s' already has size %lu\n", msg, (unsigned long)dma->size); return (1); } memset(dma, 0, sizeof(*dma)); /* sanity */ dma->sc = sc; dma->size = size; snprintf(dma->msg, sizeof(dma->msg), "%s", msg); rc = bus_dma_tag_create(sc->parent_dma_tag, /* parent tag */ BCM_PAGE_SIZE, /* alignment */ 0, /* boundary limit */ BUS_SPACE_MAXADDR, /* restricted low */ BUS_SPACE_MAXADDR, /* restricted hi */ NULL, /* addr filter() */ NULL, /* addr filter() arg */ size, /* max map size */ 1, /* num discontinuous */ size, /* max seg size */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lock() */ NULL, /* lock() arg */ &dma->tag); /* returned dma tag */ if (rc != 0) { BLOGE(sc, "Failed to create dma tag for '%s' (%d)\n", msg, rc); memset(dma, 0, sizeof(*dma)); return (1); } rc = bus_dmamem_alloc(dma->tag, (void **)&dma->vaddr, (BUS_DMA_NOWAIT | BUS_DMA_ZERO), &dma->map); if (rc != 0) { BLOGE(sc, "Failed to alloc dma mem for '%s' (%d)\n", msg, rc); bus_dma_tag_destroy(dma->tag); memset(dma, 0, sizeof(*dma)); return (1); } rc = bus_dmamap_load(dma->tag, dma->map, dma->vaddr, size, bxe_dma_map_addr, /* BLOGD in here */ dma, BUS_DMA_NOWAIT); if (rc != 0) { BLOGE(sc, "Failed to load dma map for '%s' (%d)\n", msg, rc); bus_dmamem_free(dma->tag, dma->vaddr, dma->map); bus_dma_tag_destroy(dma->tag); memset(dma, 0, sizeof(*dma)); return (1); } return (0); } void bxe_dma_free(struct bxe_softc *sc, struct bxe_dma *dma) { if (dma->size > 0) { #if 0 BLOGD(sc, DBG_LOAD, "DMA free '%s': vaddr=%p paddr=%p nseg=%d size=%lu\n", dma->msg, dma->vaddr, (void *)dma->paddr, dma->nseg, dma->size); #endif DBASSERT(sc, (dma->tag != NULL), ("dma tag is NULL")); bus_dmamap_sync(dma->tag, dma->map, (BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE)); bus_dmamap_unload(dma->tag, dma->map); bus_dmamem_free(dma->tag, dma->vaddr, dma->map); bus_dma_tag_destroy(dma->tag); } memset(dma, 0, sizeof(*dma)); } /* * These indirect read and write routines are only during init. * The locking is handled by the MCP. */ void bxe_reg_wr_ind(struct bxe_softc *sc, uint32_t addr, uint32_t val) { pci_write_config(sc->dev, PCICFG_GRC_ADDRESS, addr, 4); pci_write_config(sc->dev, PCICFG_GRC_DATA, val, 4); pci_write_config(sc->dev, PCICFG_GRC_ADDRESS, 0, 4); } uint32_t bxe_reg_rd_ind(struct bxe_softc *sc, uint32_t addr) { uint32_t val; pci_write_config(sc->dev, PCICFG_GRC_ADDRESS, addr, 4); val = pci_read_config(sc->dev, PCICFG_GRC_DATA, 4); pci_write_config(sc->dev, PCICFG_GRC_ADDRESS, 0, 4); return (val); } #if 0 void bxe_dp_dmae(struct bxe_softc *sc, struct dmae_command *dmae, int msglvl) { uint32_t src_type = dmae->opcode & DMAE_COMMAND_SRC; switch (dmae->opcode & DMAE_COMMAND_DST) { case DMAE_CMD_DST_PCI: if (src_type == DMAE_CMD_SRC_PCI) DP(msglvl, "DMAE: opcode 0x%08x\n" "src [%x:%08x], len [%d*4], dst [%x:%08x]\n" "comp_addr [%x:%08x], comp_val 0x%08x\n", dmae->opcode, dmae->src_addr_hi, dmae->src_addr_lo, dmae->len, dmae->dst_addr_hi, dmae->dst_addr_lo, dmae->comp_addr_hi, dmae->comp_addr_lo, dmae->comp_val); else DP(msglvl, "DMAE: opcode 0x%08x\n" "src [%08x], len [%d*4], dst [%x:%08x]\n" "comp_addr [%x:%08x], comp_val 0x%08x\n", dmae->opcode, dmae->src_addr_lo >> 2, dmae->len, dmae->dst_addr_hi, dmae->dst_addr_lo, dmae->comp_addr_hi, dmae->comp_addr_lo, dmae->comp_val); break; case DMAE_CMD_DST_GRC: if (src_type == DMAE_CMD_SRC_PCI) DP(msglvl, "DMAE: opcode 0x%08x\n" "src [%x:%08x], len [%d*4], dst_addr [%08x]\n" "comp_addr [%x:%08x], comp_val 0x%08x\n", dmae->opcode, dmae->src_addr_hi, dmae->src_addr_lo, dmae->len, dmae->dst_addr_lo >> 2, dmae->comp_addr_hi, dmae->comp_addr_lo, dmae->comp_val); else DP(msglvl, "DMAE: opcode 0x%08x\n" "src [%08x], len [%d*4], dst [%08x]\n" "comp_addr [%x:%08x], comp_val 0x%08x\n", dmae->opcode, dmae->src_addr_lo >> 2, dmae->len, dmae->dst_addr_lo >> 2, dmae->comp_addr_hi, dmae->comp_addr_lo, dmae->comp_val); break; default: if (src_type == DMAE_CMD_SRC_PCI) DP(msglvl, "DMAE: opcode 0x%08x\n" "src_addr [%x:%08x] len [%d * 4] dst_addr [none]\n" "comp_addr [%x:%08x] comp_val 0x%08x\n", dmae->opcode, dmae->src_addr_hi, dmae->src_addr_lo, dmae->len, dmae->comp_addr_hi, dmae->comp_addr_lo, dmae->comp_val); else DP(msglvl, "DMAE: opcode 0x%08x\n" "src_addr [%08x] len [%d * 4] dst_addr [none]\n" "comp_addr [%x:%08x] comp_val 0x%08x\n", dmae->opcode, dmae->src_addr_lo >> 2, dmae->len, dmae->comp_addr_hi, dmae->comp_addr_lo, dmae->comp_val); break; } } #endif static int bxe_acquire_hw_lock(struct bxe_softc *sc, uint32_t resource) { uint32_t lock_status; uint32_t resource_bit = (1 << resource); int func = SC_FUNC(sc); uint32_t hw_lock_control_reg; int cnt; /* validate the resource is within range */ if (resource > HW_LOCK_MAX_RESOURCE_VALUE) { BLOGE(sc, "resource 0x%x > HW_LOCK_MAX_RESOURCE_VALUE\n", resource); return (-1); } if (func <= 5) { hw_lock_control_reg = (MISC_REG_DRIVER_CONTROL_1 + (func * 8)); } else { hw_lock_control_reg = (MISC_REG_DRIVER_CONTROL_7 + ((func - 6) * 8)); } /* validate the resource is not already taken */ lock_status = REG_RD(sc, hw_lock_control_reg); if (lock_status & resource_bit) { BLOGE(sc, "resource in use (status 0x%x bit 0x%x)\n", lock_status, resource_bit); return (-1); } /* try every 5ms for 5 seconds */ for (cnt = 0; cnt < 1000; cnt++) { REG_WR(sc, (hw_lock_control_reg + 4), resource_bit); lock_status = REG_RD(sc, hw_lock_control_reg); if (lock_status & resource_bit) { return (0); } DELAY(5000); } BLOGE(sc, "Resource lock timeout!\n"); return (-1); } static int bxe_release_hw_lock(struct bxe_softc *sc, uint32_t resource) { uint32_t lock_status; uint32_t resource_bit = (1 << resource); int func = SC_FUNC(sc); uint32_t hw_lock_control_reg; /* validate the resource is within range */ if (resource > HW_LOCK_MAX_RESOURCE_VALUE) { BLOGE(sc, "resource 0x%x > HW_LOCK_MAX_RESOURCE_VALUE\n", resource); return (-1); } if (func <= 5) { hw_lock_control_reg = (MISC_REG_DRIVER_CONTROL_1 + (func * 8)); } else { hw_lock_control_reg = (MISC_REG_DRIVER_CONTROL_7 + ((func - 6) * 8)); } /* validate the resource is currently taken */ lock_status = REG_RD(sc, hw_lock_control_reg); if (!(lock_status & resource_bit)) { BLOGE(sc, "resource not in use (status 0x%x bit 0x%x)\n", lock_status, resource_bit); return (-1); } REG_WR(sc, hw_lock_control_reg, resource_bit); return (0); } /* * Per pf misc lock must be acquired before the per port mcp lock. Otherwise, * had we done things the other way around, if two pfs from the same port * would attempt to access nvram at the same time, we could run into a * scenario such as: * pf A takes the port lock. * pf B succeeds in taking the same lock since they are from the same port. * pf A takes the per pf misc lock. Performs eeprom access. * pf A finishes. Unlocks the per pf misc lock. * Pf B takes the lock and proceeds to perform it's own access. * pf A unlocks the per port lock, while pf B is still working (!). * mcp takes the per port lock and corrupts pf B's access (and/or has it's own * access corrupted by pf B).* */ static int bxe_acquire_nvram_lock(struct bxe_softc *sc) { int port = SC_PORT(sc); int count, i; uint32_t val = 0; /* acquire HW lock: protect against other PFs in PF Direct Assignment */ bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_NVRAM); /* adjust timeout for emulation/FPGA */ count = NVRAM_TIMEOUT_COUNT; if (CHIP_REV_IS_SLOW(sc)) { count *= 100; } /* request access to nvram interface */ REG_WR(sc, MCP_REG_MCPR_NVM_SW_ARB, (MCPR_NVM_SW_ARB_ARB_REQ_SET1 << port)); for (i = 0; i < count*10; i++) { val = REG_RD(sc, MCP_REG_MCPR_NVM_SW_ARB); if (val & (MCPR_NVM_SW_ARB_ARB_ARB1 << port)) { break; } DELAY(5); } if (!(val & (MCPR_NVM_SW_ARB_ARB_ARB1 << port))) { BLOGE(sc, "Cannot get access to nvram interface\n"); return (-1); } return (0); } static int bxe_release_nvram_lock(struct bxe_softc *sc) { int port = SC_PORT(sc); int count, i; uint32_t val = 0; /* adjust timeout for emulation/FPGA */ count = NVRAM_TIMEOUT_COUNT; if (CHIP_REV_IS_SLOW(sc)) { count *= 100; } /* relinquish nvram interface */ REG_WR(sc, MCP_REG_MCPR_NVM_SW_ARB, (MCPR_NVM_SW_ARB_ARB_REQ_CLR1 << port)); for (i = 0; i < count*10; i++) { val = REG_RD(sc, MCP_REG_MCPR_NVM_SW_ARB); if (!(val & (MCPR_NVM_SW_ARB_ARB_ARB1 << port))) { break; } DELAY(5); } if (val & (MCPR_NVM_SW_ARB_ARB_ARB1 << port)) { BLOGE(sc, "Cannot free access to nvram interface\n"); return (-1); } /* release HW lock: protect against other PFs in PF Direct Assignment */ bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_NVRAM); return (0); } static void bxe_enable_nvram_access(struct bxe_softc *sc) { uint32_t val; val = REG_RD(sc, MCP_REG_MCPR_NVM_ACCESS_ENABLE); /* enable both bits, even on read */ REG_WR(sc, MCP_REG_MCPR_NVM_ACCESS_ENABLE, (val | MCPR_NVM_ACCESS_ENABLE_EN | MCPR_NVM_ACCESS_ENABLE_WR_EN)); } static void bxe_disable_nvram_access(struct bxe_softc *sc) { uint32_t val; val = REG_RD(sc, MCP_REG_MCPR_NVM_ACCESS_ENABLE); /* disable both bits, even after read */ REG_WR(sc, MCP_REG_MCPR_NVM_ACCESS_ENABLE, (val & ~(MCPR_NVM_ACCESS_ENABLE_EN | MCPR_NVM_ACCESS_ENABLE_WR_EN))); } static int bxe_nvram_read_dword(struct bxe_softc *sc, uint32_t offset, uint32_t *ret_val, uint32_t cmd_flags) { int count, i, rc; uint32_t val; /* build the command word */ cmd_flags |= MCPR_NVM_COMMAND_DOIT; /* need to clear DONE bit separately */ REG_WR(sc, MCP_REG_MCPR_NVM_COMMAND, MCPR_NVM_COMMAND_DONE); /* address of the NVRAM to read from */ REG_WR(sc, MCP_REG_MCPR_NVM_ADDR, (offset & MCPR_NVM_ADDR_NVM_ADDR_VALUE)); /* issue a read command */ REG_WR(sc, MCP_REG_MCPR_NVM_COMMAND, cmd_flags); /* adjust timeout for emulation/FPGA */ count = NVRAM_TIMEOUT_COUNT; if (CHIP_REV_IS_SLOW(sc)) { count *= 100; } /* wait for completion */ *ret_val = 0; rc = -1; for (i = 0; i < count; i++) { DELAY(5); val = REG_RD(sc, MCP_REG_MCPR_NVM_COMMAND); if (val & MCPR_NVM_COMMAND_DONE) { val = REG_RD(sc, MCP_REG_MCPR_NVM_READ); /* we read nvram data in cpu order * but ethtool sees it as an array of bytes * converting to big-endian will do the work */ *ret_val = htobe32(val); rc = 0; break; } } if (rc == -1) { BLOGE(sc, "nvram read timeout expired\n"); } return (rc); } static int bxe_nvram_read(struct bxe_softc *sc, uint32_t offset, uint8_t *ret_buf, int buf_size) { uint32_t cmd_flags; uint32_t val; int rc; if ((offset & 0x03) || (buf_size & 0x03) || (buf_size == 0)) { BLOGE(sc, "Invalid parameter, offset 0x%x buf_size 0x%x\n", offset, buf_size); return (-1); } if ((offset + buf_size) > sc->devinfo.flash_size) { BLOGE(sc, "Invalid parameter, " "offset 0x%x + buf_size 0x%x > flash_size 0x%x\n", offset, buf_size, sc->devinfo.flash_size); return (-1); } /* request access to nvram interface */ rc = bxe_acquire_nvram_lock(sc); if (rc) { return (rc); } /* enable access to nvram interface */ bxe_enable_nvram_access(sc); /* read the first word(s) */ cmd_flags = MCPR_NVM_COMMAND_FIRST; while ((buf_size > sizeof(uint32_t)) && (rc == 0)) { rc = bxe_nvram_read_dword(sc, offset, &val, cmd_flags); memcpy(ret_buf, &val, 4); /* advance to the next dword */ offset += sizeof(uint32_t); ret_buf += sizeof(uint32_t); buf_size -= sizeof(uint32_t); cmd_flags = 0; } if (rc == 0) { cmd_flags |= MCPR_NVM_COMMAND_LAST; rc = bxe_nvram_read_dword(sc, offset, &val, cmd_flags); memcpy(ret_buf, &val, 4); } /* disable access to nvram interface */ bxe_disable_nvram_access(sc); bxe_release_nvram_lock(sc); return (rc); } static int bxe_nvram_write_dword(struct bxe_softc *sc, uint32_t offset, uint32_t val, uint32_t cmd_flags) { int count, i, rc; /* build the command word */ cmd_flags |= (MCPR_NVM_COMMAND_DOIT | MCPR_NVM_COMMAND_WR); /* need to clear DONE bit separately */ REG_WR(sc, MCP_REG_MCPR_NVM_COMMAND, MCPR_NVM_COMMAND_DONE); /* write the data */ REG_WR(sc, MCP_REG_MCPR_NVM_WRITE, val); /* address of the NVRAM to write to */ REG_WR(sc, MCP_REG_MCPR_NVM_ADDR, (offset & MCPR_NVM_ADDR_NVM_ADDR_VALUE)); /* issue the write command */ REG_WR(sc, MCP_REG_MCPR_NVM_COMMAND, cmd_flags); /* adjust timeout for emulation/FPGA */ count = NVRAM_TIMEOUT_COUNT; if (CHIP_REV_IS_SLOW(sc)) { count *= 100; } /* wait for completion */ rc = -1; for (i = 0; i < count; i++) { DELAY(5); val = REG_RD(sc, MCP_REG_MCPR_NVM_COMMAND); if (val & MCPR_NVM_COMMAND_DONE) { rc = 0; break; } } if (rc == -1) { BLOGE(sc, "nvram write timeout expired\n"); } return (rc); } #define BYTE_OFFSET(offset) (8 * (offset & 0x03)) static int bxe_nvram_write1(struct bxe_softc *sc, uint32_t offset, uint8_t *data_buf, int buf_size) { uint32_t cmd_flags; uint32_t align_offset; uint32_t val; int rc; if ((offset + buf_size) > sc->devinfo.flash_size) { BLOGE(sc, "Invalid parameter, " "offset 0x%x + buf_size 0x%x > flash_size 0x%x\n", offset, buf_size, sc->devinfo.flash_size); return (-1); } /* request access to nvram interface */ rc = bxe_acquire_nvram_lock(sc); if (rc) { return (rc); } /* enable access to nvram interface */ bxe_enable_nvram_access(sc); cmd_flags = (MCPR_NVM_COMMAND_FIRST | MCPR_NVM_COMMAND_LAST); align_offset = (offset & ~0x03); rc = bxe_nvram_read_dword(sc, align_offset, &val, cmd_flags); if (rc == 0) { val &= ~(0xff << BYTE_OFFSET(offset)); val |= (*data_buf << BYTE_OFFSET(offset)); /* nvram data is returned as an array of bytes * convert it back to cpu order */ val = be32toh(val); rc = bxe_nvram_write_dword(sc, align_offset, val, cmd_flags); } /* disable access to nvram interface */ bxe_disable_nvram_access(sc); bxe_release_nvram_lock(sc); return (rc); } static int bxe_nvram_write(struct bxe_softc *sc, uint32_t offset, uint8_t *data_buf, int buf_size) { uint32_t cmd_flags; uint32_t val; uint32_t written_so_far; int rc; if (buf_size == 1) { return (bxe_nvram_write1(sc, offset, data_buf, buf_size)); } if ((offset & 0x03) || (buf_size & 0x03) /* || (buf_size == 0) */) { BLOGE(sc, "Invalid parameter, offset 0x%x buf_size 0x%x\n", offset, buf_size); return (-1); } if (buf_size == 0) { return (0); /* nothing to do */ } if ((offset + buf_size) > sc->devinfo.flash_size) { BLOGE(sc, "Invalid parameter, " "offset 0x%x + buf_size 0x%x > flash_size 0x%x\n", offset, buf_size, sc->devinfo.flash_size); return (-1); } /* request access to nvram interface */ rc = bxe_acquire_nvram_lock(sc); if (rc) { return (rc); } /* enable access to nvram interface */ bxe_enable_nvram_access(sc); written_so_far = 0; cmd_flags = MCPR_NVM_COMMAND_FIRST; while ((written_so_far < buf_size) && (rc == 0)) { if (written_so_far == (buf_size - sizeof(uint32_t))) { cmd_flags |= MCPR_NVM_COMMAND_LAST; } else if (((offset + 4) % NVRAM_PAGE_SIZE) == 0) { cmd_flags |= MCPR_NVM_COMMAND_LAST; } else if ((offset % NVRAM_PAGE_SIZE) == 0) { cmd_flags |= MCPR_NVM_COMMAND_FIRST; } memcpy(&val, data_buf, 4); rc = bxe_nvram_write_dword(sc, offset, val, cmd_flags); /* advance to the next dword */ offset += sizeof(uint32_t); data_buf += sizeof(uint32_t); written_so_far += sizeof(uint32_t); cmd_flags = 0; } /* disable access to nvram interface */ bxe_disable_nvram_access(sc); bxe_release_nvram_lock(sc); return (rc); } /* copy command into DMAE command memory and set DMAE command Go */ void bxe_post_dmae(struct bxe_softc *sc, struct dmae_command *dmae, int idx) { uint32_t cmd_offset; int i; cmd_offset = (DMAE_REG_CMD_MEM + (sizeof(struct dmae_command) * idx)); for (i = 0; i < ((sizeof(struct dmae_command) / 4)); i++) { REG_WR(sc, (cmd_offset + (i * 4)), *(((uint32_t *)dmae) + i)); } REG_WR(sc, dmae_reg_go_c[idx], 1); } uint32_t bxe_dmae_opcode_add_comp(uint32_t opcode, uint8_t comp_type) { return (opcode | ((comp_type << DMAE_COMMAND_C_DST_SHIFT) | DMAE_COMMAND_C_TYPE_ENABLE)); } uint32_t bxe_dmae_opcode_clr_src_reset(uint32_t opcode) { return (opcode & ~DMAE_COMMAND_SRC_RESET); } uint32_t bxe_dmae_opcode(struct bxe_softc *sc, uint8_t src_type, uint8_t dst_type, uint8_t with_comp, uint8_t comp_type) { uint32_t opcode = 0; opcode |= ((src_type << DMAE_COMMAND_SRC_SHIFT) | (dst_type << DMAE_COMMAND_DST_SHIFT)); opcode |= (DMAE_COMMAND_SRC_RESET | DMAE_COMMAND_DST_RESET); opcode |= (SC_PORT(sc) ? DMAE_CMD_PORT_1 : DMAE_CMD_PORT_0); opcode |= ((SC_VN(sc) << DMAE_COMMAND_E1HVN_SHIFT) | (SC_VN(sc) << DMAE_COMMAND_DST_VN_SHIFT)); opcode |= (DMAE_COM_SET_ERR << DMAE_COMMAND_ERR_POLICY_SHIFT); #ifdef __BIG_ENDIAN opcode |= DMAE_CMD_ENDIANITY_B_DW_SWAP; #else opcode |= DMAE_CMD_ENDIANITY_DW_SWAP; #endif if (with_comp) { opcode = bxe_dmae_opcode_add_comp(opcode, comp_type); } return (opcode); } static void bxe_prep_dmae_with_comp(struct bxe_softc *sc, struct dmae_command *dmae, uint8_t src_type, uint8_t dst_type) { memset(dmae, 0, sizeof(struct dmae_command)); /* set the opcode */ dmae->opcode = bxe_dmae_opcode(sc, src_type, dst_type, TRUE, DMAE_COMP_PCI); /* fill in the completion parameters */ dmae->comp_addr_lo = U64_LO(BXE_SP_MAPPING(sc, wb_comp)); dmae->comp_addr_hi = U64_HI(BXE_SP_MAPPING(sc, wb_comp)); dmae->comp_val = DMAE_COMP_VAL; } /* issue a DMAE command over the init channel and wait for completion */ static int bxe_issue_dmae_with_comp(struct bxe_softc *sc, struct dmae_command *dmae) { uint32_t *wb_comp = BXE_SP(sc, wb_comp); int timeout = CHIP_REV_IS_SLOW(sc) ? 400000 : 4000; BXE_DMAE_LOCK(sc); /* reset completion */ *wb_comp = 0; /* post the command on the channel used for initializations */ bxe_post_dmae(sc, dmae, INIT_DMAE_C(sc)); /* wait for completion */ DELAY(5); while ((*wb_comp & ~DMAE_PCI_ERR_FLAG) != DMAE_COMP_VAL) { if (!timeout || (sc->recovery_state != BXE_RECOVERY_DONE && sc->recovery_state != BXE_RECOVERY_NIC_LOADING)) { BLOGE(sc, "DMAE timeout!\n"); BXE_DMAE_UNLOCK(sc); return (DMAE_TIMEOUT); } timeout--; DELAY(50); } if (*wb_comp & DMAE_PCI_ERR_FLAG) { BLOGE(sc, "DMAE PCI error!\n"); BXE_DMAE_UNLOCK(sc); return (DMAE_PCI_ERROR); } BXE_DMAE_UNLOCK(sc); return (0); } void bxe_read_dmae(struct bxe_softc *sc, uint32_t src_addr, uint32_t len32) { struct dmae_command dmae; uint32_t *data; int i, rc; DBASSERT(sc, (len32 <= 4), ("DMAE read length is %d", len32)); if (!sc->dmae_ready) { data = BXE_SP(sc, wb_data[0]); for (i = 0; i < len32; i++) { data[i] = (CHIP_IS_E1(sc)) ? bxe_reg_rd_ind(sc, (src_addr + (i * 4))) : REG_RD(sc, (src_addr + (i * 4))); } return; } /* set opcode and fixed command fields */ bxe_prep_dmae_with_comp(sc, &dmae, DMAE_SRC_GRC, DMAE_DST_PCI); /* fill in addresses and len */ dmae.src_addr_lo = (src_addr >> 2); /* GRC addr has dword resolution */ dmae.src_addr_hi = 0; dmae.dst_addr_lo = U64_LO(BXE_SP_MAPPING(sc, wb_data)); dmae.dst_addr_hi = U64_HI(BXE_SP_MAPPING(sc, wb_data)); dmae.len = len32; /* issue the command and wait for completion */ if ((rc = bxe_issue_dmae_with_comp(sc, &dmae)) != 0) { bxe_panic(sc, ("DMAE failed (%d)\n", rc)); }; } void bxe_write_dmae(struct bxe_softc *sc, bus_addr_t dma_addr, uint32_t dst_addr, uint32_t len32) { struct dmae_command dmae; int rc; if (!sc->dmae_ready) { DBASSERT(sc, (len32 <= 4), ("DMAE not ready and length is %d", len32)); if (CHIP_IS_E1(sc)) { ecore_init_ind_wr(sc, dst_addr, BXE_SP(sc, wb_data[0]), len32); } else { ecore_init_str_wr(sc, dst_addr, BXE_SP(sc, wb_data[0]), len32); } return; } /* set opcode and fixed command fields */ bxe_prep_dmae_with_comp(sc, &dmae, DMAE_SRC_PCI, DMAE_DST_GRC); /* fill in addresses and len */ dmae.src_addr_lo = U64_LO(dma_addr); dmae.src_addr_hi = U64_HI(dma_addr); dmae.dst_addr_lo = (dst_addr >> 2); /* GRC addr has dword resolution */ dmae.dst_addr_hi = 0; dmae.len = len32; /* issue the command and wait for completion */ if ((rc = bxe_issue_dmae_with_comp(sc, &dmae)) != 0) { bxe_panic(sc, ("DMAE failed (%d)\n", rc)); } } void bxe_write_dmae_phys_len(struct bxe_softc *sc, bus_addr_t phys_addr, uint32_t addr, uint32_t len) { int dmae_wr_max = DMAE_LEN32_WR_MAX(sc); int offset = 0; while (len > dmae_wr_max) { bxe_write_dmae(sc, (phys_addr + offset), /* src DMA address */ (addr + offset), /* dst GRC address */ dmae_wr_max); offset += (dmae_wr_max * 4); len -= dmae_wr_max; } bxe_write_dmae(sc, (phys_addr + offset), /* src DMA address */ (addr + offset), /* dst GRC address */ len); } void bxe_set_ctx_validation(struct bxe_softc *sc, struct eth_context *cxt, uint32_t cid) { /* ustorm cxt validation */ cxt->ustorm_ag_context.cdu_usage = CDU_RSRVD_VALUE_TYPE_A(HW_CID(sc, cid), CDU_REGION_NUMBER_UCM_AG, ETH_CONNECTION_TYPE); /* xcontext validation */ cxt->xstorm_ag_context.cdu_reserved = CDU_RSRVD_VALUE_TYPE_A(HW_CID(sc, cid), CDU_REGION_NUMBER_XCM_AG, ETH_CONNECTION_TYPE); } static void bxe_storm_memset_hc_timeout(struct bxe_softc *sc, uint8_t port, uint8_t fw_sb_id, uint8_t sb_index, uint8_t ticks) { uint32_t addr = (BAR_CSTRORM_INTMEM + CSTORM_STATUS_BLOCK_DATA_TIMEOUT_OFFSET(fw_sb_id, sb_index)); REG_WR8(sc, addr, ticks); BLOGD(sc, DBG_LOAD, "port %d fw_sb_id %d sb_index %d ticks %d\n", port, fw_sb_id, sb_index, ticks); } static void bxe_storm_memset_hc_disable(struct bxe_softc *sc, uint8_t port, uint16_t fw_sb_id, uint8_t sb_index, uint8_t disable) { uint32_t enable_flag = (disable) ? 0 : (1 << HC_INDEX_DATA_HC_ENABLED_SHIFT); uint32_t addr = (BAR_CSTRORM_INTMEM + CSTORM_STATUS_BLOCK_DATA_FLAGS_OFFSET(fw_sb_id, sb_index)); uint8_t flags; /* clear and set */ flags = REG_RD8(sc, addr); flags &= ~HC_INDEX_DATA_HC_ENABLED; flags |= enable_flag; REG_WR8(sc, addr, flags); BLOGD(sc, DBG_LOAD, "port %d fw_sb_id %d sb_index %d disable %d\n", port, fw_sb_id, sb_index, disable); } void bxe_update_coalesce_sb_index(struct bxe_softc *sc, uint8_t fw_sb_id, uint8_t sb_index, uint8_t disable, uint16_t usec) { int port = SC_PORT(sc); uint8_t ticks = (usec / 4); /* XXX ??? */ bxe_storm_memset_hc_timeout(sc, port, fw_sb_id, sb_index, ticks); disable = (disable) ? 1 : ((usec) ? 0 : 1); bxe_storm_memset_hc_disable(sc, port, fw_sb_id, sb_index, disable); } void elink_cb_udelay(struct bxe_softc *sc, uint32_t usecs) { DELAY(usecs); } uint32_t elink_cb_reg_read(struct bxe_softc *sc, uint32_t reg_addr) { return (REG_RD(sc, reg_addr)); } void elink_cb_reg_write(struct bxe_softc *sc, uint32_t reg_addr, uint32_t val) { REG_WR(sc, reg_addr, val); } void elink_cb_reg_wb_write(struct bxe_softc *sc, uint32_t offset, uint32_t *wb_write, uint16_t len) { REG_WR_DMAE(sc, offset, wb_write, len); } void elink_cb_reg_wb_read(struct bxe_softc *sc, uint32_t offset, uint32_t *wb_write, uint16_t len) { REG_RD_DMAE(sc, offset, wb_write, len); } uint8_t elink_cb_path_id(struct bxe_softc *sc) { return (SC_PATH(sc)); } void elink_cb_event_log(struct bxe_softc *sc, const elink_log_id_t elink_log_id, ...) { /* XXX */ #if 0 //va_list ap; va_start(ap, elink_log_id); _XXX_(sc, lm_log_id, ap); va_end(ap); #endif BLOGI(sc, "ELINK EVENT LOG (%d)\n", elink_log_id); } static int bxe_set_spio(struct bxe_softc *sc, int spio, uint32_t mode) { uint32_t spio_reg; /* Only 2 SPIOs are configurable */ if ((spio != MISC_SPIO_SPIO4) && (spio != MISC_SPIO_SPIO5)) { BLOGE(sc, "Invalid SPIO 0x%x\n", spio); return (-1); } bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_SPIO); /* read SPIO and mask except the float bits */ spio_reg = (REG_RD(sc, MISC_REG_SPIO) & MISC_SPIO_FLOAT); switch (mode) { case MISC_SPIO_OUTPUT_LOW: BLOGD(sc, DBG_LOAD, "Set SPIO 0x%x -> output low\n", spio); /* clear FLOAT and set CLR */ spio_reg &= ~(spio << MISC_SPIO_FLOAT_POS); spio_reg |= (spio << MISC_SPIO_CLR_POS); break; case MISC_SPIO_OUTPUT_HIGH: BLOGD(sc, DBG_LOAD, "Set SPIO 0x%x -> output high\n", spio); /* clear FLOAT and set SET */ spio_reg &= ~(spio << MISC_SPIO_FLOAT_POS); spio_reg |= (spio << MISC_SPIO_SET_POS); break; case MISC_SPIO_INPUT_HI_Z: BLOGD(sc, DBG_LOAD, "Set SPIO 0x%x -> input\n", spio); /* set FLOAT */ spio_reg |= (spio << MISC_SPIO_FLOAT_POS); break; default: break; } REG_WR(sc, MISC_REG_SPIO, spio_reg); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_SPIO); return (0); } static int bxe_gpio_read(struct bxe_softc *sc, int gpio_num, uint8_t port) { /* The GPIO should be swapped if swap register is set and active */ int gpio_port = ((REG_RD(sc, NIG_REG_PORT_SWAP) && REG_RD(sc, NIG_REG_STRAP_OVERRIDE)) ^ port); int gpio_shift = (gpio_num + (gpio_port ? MISC_REGISTERS_GPIO_PORT_SHIFT : 0)); uint32_t gpio_mask = (1 << gpio_shift); uint32_t gpio_reg; if (gpio_num > MISC_REGISTERS_GPIO_3) { BLOGE(sc, "Invalid GPIO %d\n", gpio_num); return (-1); } /* read GPIO value */ gpio_reg = REG_RD(sc, MISC_REG_GPIO); /* get the requested pin value */ return ((gpio_reg & gpio_mask) == gpio_mask) ? 1 : 0; } static int bxe_gpio_write(struct bxe_softc *sc, int gpio_num, uint32_t mode, uint8_t port) { /* The GPIO should be swapped if swap register is set and active */ int gpio_port = ((REG_RD(sc, NIG_REG_PORT_SWAP) && REG_RD(sc, NIG_REG_STRAP_OVERRIDE)) ^ port); int gpio_shift = (gpio_num + (gpio_port ? MISC_REGISTERS_GPIO_PORT_SHIFT : 0)); uint32_t gpio_mask = (1 << gpio_shift); uint32_t gpio_reg; if (gpio_num > MISC_REGISTERS_GPIO_3) { BLOGE(sc, "Invalid GPIO %d\n", gpio_num); return (-1); } bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_GPIO); /* read GPIO and mask except the float bits */ gpio_reg = (REG_RD(sc, MISC_REG_GPIO) & MISC_REGISTERS_GPIO_FLOAT); switch (mode) { case MISC_REGISTERS_GPIO_OUTPUT_LOW: BLOGD(sc, DBG_PHY, "Set GPIO %d (shift %d) -> output low\n", gpio_num, gpio_shift); /* clear FLOAT and set CLR */ gpio_reg &= ~(gpio_mask << MISC_REGISTERS_GPIO_FLOAT_POS); gpio_reg |= (gpio_mask << MISC_REGISTERS_GPIO_CLR_POS); break; case MISC_REGISTERS_GPIO_OUTPUT_HIGH: BLOGD(sc, DBG_PHY, "Set GPIO %d (shift %d) -> output high\n", gpio_num, gpio_shift); /* clear FLOAT and set SET */ gpio_reg &= ~(gpio_mask << MISC_REGISTERS_GPIO_FLOAT_POS); gpio_reg |= (gpio_mask << MISC_REGISTERS_GPIO_SET_POS); break; case MISC_REGISTERS_GPIO_INPUT_HI_Z: BLOGD(sc, DBG_PHY, "Set GPIO %d (shift %d) -> input\n", gpio_num, gpio_shift); /* set FLOAT */ gpio_reg |= (gpio_mask << MISC_REGISTERS_GPIO_FLOAT_POS); break; default: break; } REG_WR(sc, MISC_REG_GPIO, gpio_reg); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_GPIO); return (0); } static int bxe_gpio_mult_write(struct bxe_softc *sc, uint8_t pins, uint32_t mode) { uint32_t gpio_reg; /* any port swapping should be handled by caller */ bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_GPIO); /* read GPIO and mask except the float bits */ gpio_reg = REG_RD(sc, MISC_REG_GPIO); gpio_reg &= ~(pins << MISC_REGISTERS_GPIO_FLOAT_POS); gpio_reg &= ~(pins << MISC_REGISTERS_GPIO_CLR_POS); gpio_reg &= ~(pins << MISC_REGISTERS_GPIO_SET_POS); switch (mode) { case MISC_REGISTERS_GPIO_OUTPUT_LOW: BLOGD(sc, DBG_PHY, "Set GPIO 0x%x -> output low\n", pins); /* set CLR */ gpio_reg |= (pins << MISC_REGISTERS_GPIO_CLR_POS); break; case MISC_REGISTERS_GPIO_OUTPUT_HIGH: BLOGD(sc, DBG_PHY, "Set GPIO 0x%x -> output high\n", pins); /* set SET */ gpio_reg |= (pins << MISC_REGISTERS_GPIO_SET_POS); break; case MISC_REGISTERS_GPIO_INPUT_HI_Z: BLOGD(sc, DBG_PHY, "Set GPIO 0x%x -> input\n", pins); /* set FLOAT */ gpio_reg |= (pins << MISC_REGISTERS_GPIO_FLOAT_POS); break; default: BLOGE(sc, "Invalid GPIO mode assignment %d\n", mode); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_GPIO); return (-1); } REG_WR(sc, MISC_REG_GPIO, gpio_reg); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_GPIO); return (0); } static int bxe_gpio_int_write(struct bxe_softc *sc, int gpio_num, uint32_t mode, uint8_t port) { /* The GPIO should be swapped if swap register is set and active */ int gpio_port = ((REG_RD(sc, NIG_REG_PORT_SWAP) && REG_RD(sc, NIG_REG_STRAP_OVERRIDE)) ^ port); int gpio_shift = (gpio_num + (gpio_port ? MISC_REGISTERS_GPIO_PORT_SHIFT : 0)); uint32_t gpio_mask = (1 << gpio_shift); uint32_t gpio_reg; if (gpio_num > MISC_REGISTERS_GPIO_3) { BLOGE(sc, "Invalid GPIO %d\n", gpio_num); return (-1); } bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_GPIO); /* read GPIO int */ gpio_reg = REG_RD(sc, MISC_REG_GPIO_INT); switch (mode) { case MISC_REGISTERS_GPIO_INT_OUTPUT_CLR: BLOGD(sc, DBG_PHY, "Clear GPIO INT %d (shift %d) -> output low\n", gpio_num, gpio_shift); /* clear SET and set CLR */ gpio_reg &= ~(gpio_mask << MISC_REGISTERS_GPIO_INT_SET_POS); gpio_reg |= (gpio_mask << MISC_REGISTERS_GPIO_INT_CLR_POS); break; case MISC_REGISTERS_GPIO_INT_OUTPUT_SET: BLOGD(sc, DBG_PHY, "Set GPIO INT %d (shift %d) -> output high\n", gpio_num, gpio_shift); /* clear CLR and set SET */ gpio_reg &= ~(gpio_mask << MISC_REGISTERS_GPIO_INT_CLR_POS); gpio_reg |= (gpio_mask << MISC_REGISTERS_GPIO_INT_SET_POS); break; default: break; } REG_WR(sc, MISC_REG_GPIO_INT, gpio_reg); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_GPIO); return (0); } uint32_t elink_cb_gpio_read(struct bxe_softc *sc, uint16_t gpio_num, uint8_t port) { return (bxe_gpio_read(sc, gpio_num, port)); } uint8_t elink_cb_gpio_write(struct bxe_softc *sc, uint16_t gpio_num, uint8_t mode, /* 0=low 1=high */ uint8_t port) { return (bxe_gpio_write(sc, gpio_num, mode, port)); } uint8_t elink_cb_gpio_mult_write(struct bxe_softc *sc, uint8_t pins, uint8_t mode) /* 0=low 1=high */ { return (bxe_gpio_mult_write(sc, pins, mode)); } uint8_t elink_cb_gpio_int_write(struct bxe_softc *sc, uint16_t gpio_num, uint8_t mode, /* 0=low 1=high */ uint8_t port) { return (bxe_gpio_int_write(sc, gpio_num, mode, port)); } void elink_cb_notify_link_changed(struct bxe_softc *sc) { REG_WR(sc, (MISC_REG_AEU_GENERAL_ATTN_12 + (SC_FUNC(sc) * sizeof(uint32_t))), 1); } /* send the MCP a request, block until there is a reply */ uint32_t elink_cb_fw_command(struct bxe_softc *sc, uint32_t command, uint32_t param) { int mb_idx = SC_FW_MB_IDX(sc); uint32_t seq; uint32_t rc = 0; uint32_t cnt = 1; uint8_t delay = CHIP_REV_IS_SLOW(sc) ? 100 : 10; BXE_FWMB_LOCK(sc); seq = ++sc->fw_seq; SHMEM_WR(sc, func_mb[mb_idx].drv_mb_param, param); SHMEM_WR(sc, func_mb[mb_idx].drv_mb_header, (command | seq)); BLOGD(sc, DBG_PHY, "wrote command 0x%08x to FW MB param 0x%08x\n", (command | seq), param); /* Let the FW do it's magic. GIve it up to 5 seconds... */ do { DELAY(delay * 1000); rc = SHMEM_RD(sc, func_mb[mb_idx].fw_mb_header); } while ((seq != (rc & FW_MSG_SEQ_NUMBER_MASK)) && (cnt++ < 500)); BLOGD(sc, DBG_PHY, "[after %d ms] read 0x%x seq 0x%x from FW MB\n", cnt*delay, rc, seq); /* is this a reply to our command? */ if (seq == (rc & FW_MSG_SEQ_NUMBER_MASK)) { rc &= FW_MSG_CODE_MASK; } else { /* Ruh-roh! */ BLOGE(sc, "FW failed to respond!\n"); // XXX bxe_fw_dump(sc); rc = 0; } BXE_FWMB_UNLOCK(sc); return (rc); } static uint32_t bxe_fw_command(struct bxe_softc *sc, uint32_t command, uint32_t param) { return (elink_cb_fw_command(sc, command, param)); } static void __storm_memset_dma_mapping(struct bxe_softc *sc, uint32_t addr, bus_addr_t mapping) { REG_WR(sc, addr, U64_LO(mapping)); REG_WR(sc, (addr + 4), U64_HI(mapping)); } static void storm_memset_spq_addr(struct bxe_softc *sc, bus_addr_t mapping, uint16_t abs_fid) { uint32_t addr = (XSEM_REG_FAST_MEMORY + XSTORM_SPQ_PAGE_BASE_OFFSET(abs_fid)); __storm_memset_dma_mapping(sc, addr, mapping); } static void storm_memset_vf_to_pf(struct bxe_softc *sc, uint16_t abs_fid, uint16_t pf_id) { REG_WR8(sc, (BAR_XSTRORM_INTMEM + XSTORM_VF_TO_PF_OFFSET(abs_fid)), pf_id); REG_WR8(sc, (BAR_CSTRORM_INTMEM + CSTORM_VF_TO_PF_OFFSET(abs_fid)), pf_id); REG_WR8(sc, (BAR_TSTRORM_INTMEM + TSTORM_VF_TO_PF_OFFSET(abs_fid)), pf_id); REG_WR8(sc, (BAR_USTRORM_INTMEM + USTORM_VF_TO_PF_OFFSET(abs_fid)), pf_id); } static void storm_memset_func_en(struct bxe_softc *sc, uint16_t abs_fid, uint8_t enable) { REG_WR8(sc, (BAR_XSTRORM_INTMEM + XSTORM_FUNC_EN_OFFSET(abs_fid)), enable); REG_WR8(sc, (BAR_CSTRORM_INTMEM + CSTORM_FUNC_EN_OFFSET(abs_fid)), enable); REG_WR8(sc, (BAR_TSTRORM_INTMEM + TSTORM_FUNC_EN_OFFSET(abs_fid)), enable); REG_WR8(sc, (BAR_USTRORM_INTMEM + USTORM_FUNC_EN_OFFSET(abs_fid)), enable); } static void storm_memset_eq_data(struct bxe_softc *sc, struct event_ring_data *eq_data, uint16_t pfid) { uint32_t addr; size_t size; addr = (BAR_CSTRORM_INTMEM + CSTORM_EVENT_RING_DATA_OFFSET(pfid)); size = sizeof(struct event_ring_data); ecore_storm_memset_struct(sc, addr, size, (uint32_t *)eq_data); } static void storm_memset_eq_prod(struct bxe_softc *sc, uint16_t eq_prod, uint16_t pfid) { uint32_t addr = (BAR_CSTRORM_INTMEM + CSTORM_EVENT_RING_PROD_OFFSET(pfid)); REG_WR16(sc, addr, eq_prod); } /* * Post a slowpath command. * * A slowpath command is used to propogate a configuration change through * the controller in a controlled manner, allowing each STORM processor and * other H/W blocks to phase in the change. The commands sent on the * slowpath are referred to as ramrods. Depending on the ramrod used the * completion of the ramrod will occur in different ways. Here's a * breakdown of ramrods and how they complete: * * RAMROD_CMD_ID_ETH_PORT_SETUP * Used to setup the leading connection on a port. Completes on the * Receive Completion Queue (RCQ) of that port (typically fp[0]). * * RAMROD_CMD_ID_ETH_CLIENT_SETUP * Used to setup an additional connection on a port. Completes on the * RCQ of the multi-queue/RSS connection being initialized. * * RAMROD_CMD_ID_ETH_STAT_QUERY * Used to force the storm processors to update the statistics database * in host memory. This ramrod is send on the leading connection CID and * completes as an index increment of the CSTORM on the default status * block. * * RAMROD_CMD_ID_ETH_UPDATE * Used to update the state of the leading connection, usually to udpate * the RSS indirection table. Completes on the RCQ of the leading * connection. (Not currently used under FreeBSD until OS support becomes * available.) * * RAMROD_CMD_ID_ETH_HALT * Used when tearing down a connection prior to driver unload. Completes * on the RCQ of the multi-queue/RSS connection being torn down. Don't * use this on the leading connection. * * RAMROD_CMD_ID_ETH_SET_MAC * Sets the Unicast/Broadcast/Multicast used by the port. Completes on * the RCQ of the leading connection. * * RAMROD_CMD_ID_ETH_CFC_DEL * Used when tearing down a conneciton prior to driver unload. Completes * on the RCQ of the leading connection (since the current connection * has been completely removed from controller memory). * * RAMROD_CMD_ID_ETH_PORT_DEL * Used to tear down the leading connection prior to driver unload, * typically fp[0]. Completes as an index increment of the CSTORM on the * default status block. * * RAMROD_CMD_ID_ETH_FORWARD_SETUP * Used for connection offload. Completes on the RCQ of the multi-queue * RSS connection that is being offloaded. (Not currently used under * FreeBSD.) * * There can only be one command pending per function. * * Returns: * 0 = Success, !0 = Failure. */ /* must be called under the spq lock */ static inline struct eth_spe *bxe_sp_get_next(struct bxe_softc *sc) { struct eth_spe *next_spe = sc->spq_prod_bd; if (sc->spq_prod_bd == sc->spq_last_bd) { /* wrap back to the first eth_spq */ sc->spq_prod_bd = sc->spq; sc->spq_prod_idx = 0; } else { sc->spq_prod_bd++; sc->spq_prod_idx++; } return (next_spe); } /* must be called under the spq lock */ static inline void bxe_sp_prod_update(struct bxe_softc *sc) { int func = SC_FUNC(sc); /* * Make sure that BD data is updated before writing the producer. * BD data is written to the memory, the producer is read from the * memory, thus we need a full memory barrier to ensure the ordering. */ mb(); REG_WR16(sc, (BAR_XSTRORM_INTMEM + XSTORM_SPQ_PROD_OFFSET(func)), sc->spq_prod_idx); bus_space_barrier(sc->bar[BAR0].tag, sc->bar[BAR0].handle, 0, 0, BUS_SPACE_BARRIER_WRITE); } /** * bxe_is_contextless_ramrod - check if the current command ends on EQ * * @cmd: command to check * @cmd_type: command type */ static inline int bxe_is_contextless_ramrod(int cmd, int cmd_type) { if ((cmd_type == NONE_CONNECTION_TYPE) || (cmd == RAMROD_CMD_ID_ETH_FORWARD_SETUP) || (cmd == RAMROD_CMD_ID_ETH_CLASSIFICATION_RULES) || (cmd == RAMROD_CMD_ID_ETH_FILTER_RULES) || (cmd == RAMROD_CMD_ID_ETH_MULTICAST_RULES) || (cmd == RAMROD_CMD_ID_ETH_SET_MAC) || (cmd == RAMROD_CMD_ID_ETH_RSS_UPDATE)) { return (TRUE); } else { return (FALSE); } } /** * bxe_sp_post - place a single command on an SP ring * * @sc: driver handle * @command: command to place (e.g. SETUP, FILTER_RULES, etc.) * @cid: SW CID the command is related to * @data_hi: command private data address (high 32 bits) * @data_lo: command private data address (low 32 bits) * @cmd_type: command type (e.g. NONE, ETH) * * SP data is handled as if it's always an address pair, thus data fields are * not swapped to little endian in upper functions. Instead this function swaps * data as if it's two uint32 fields. */ int bxe_sp_post(struct bxe_softc *sc, int command, int cid, uint32_t data_hi, uint32_t data_lo, int cmd_type) { struct eth_spe *spe; uint16_t type; int common; common = bxe_is_contextless_ramrod(command, cmd_type); BXE_SP_LOCK(sc); if (common) { if (!atomic_load_acq_long(&sc->eq_spq_left)) { BLOGE(sc, "EQ ring is full!\n"); BXE_SP_UNLOCK(sc); return (-1); } } else { if (!atomic_load_acq_long(&sc->cq_spq_left)) { BLOGE(sc, "SPQ ring is full!\n"); BXE_SP_UNLOCK(sc); return (-1); } } spe = bxe_sp_get_next(sc); /* CID needs port number to be encoded int it */ spe->hdr.conn_and_cmd_data = htole32((command << SPE_HDR_CMD_ID_SHIFT) | HW_CID(sc, cid)); type = (cmd_type << SPE_HDR_CONN_TYPE_SHIFT) & SPE_HDR_CONN_TYPE; /* TBD: Check if it works for VFs */ type |= ((SC_FUNC(sc) << SPE_HDR_FUNCTION_ID_SHIFT) & SPE_HDR_FUNCTION_ID); spe->hdr.type = htole16(type); spe->data.update_data_addr.hi = htole32(data_hi); spe->data.update_data_addr.lo = htole32(data_lo); /* * It's ok if the actual decrement is issued towards the memory * somewhere between the lock and unlock. Thus no more explict * memory barrier is needed. */ if (common) { atomic_subtract_acq_long(&sc->eq_spq_left, 1); } else { atomic_subtract_acq_long(&sc->cq_spq_left, 1); } BLOGD(sc, DBG_SP, "SPQE -> %#jx\n", (uintmax_t)sc->spq_dma.paddr); BLOGD(sc, DBG_SP, "FUNC_RDATA -> %p / %#jx\n", BXE_SP(sc, func_rdata), (uintmax_t)BXE_SP_MAPPING(sc, func_rdata)); BLOGD(sc, DBG_SP, "SPQE[%x] (%x:%x) (cmd, common?) (%d,%d) hw_cid %x data (%x:%x) type(0x%x) left (CQ, EQ) (%lx,%lx)\n", sc->spq_prod_idx, (uint32_t)U64_HI(sc->spq_dma.paddr), (uint32_t)(U64_LO(sc->spq_dma.paddr) + (uint8_t *)sc->spq_prod_bd - (uint8_t *)sc->spq), command, common, HW_CID(sc, cid), data_hi, data_lo, type, atomic_load_acq_long(&sc->cq_spq_left), atomic_load_acq_long(&sc->eq_spq_left)); bxe_sp_prod_update(sc); BXE_SP_UNLOCK(sc); return (0); } /** * bxe_debug_print_ind_table - prints the indirection table configuration. * * @sc: driver hanlde * @p: pointer to rss configuration */ #if 0 static void bxe_debug_print_ind_table(struct bxe_softc *sc, struct ecore_config_rss_params *p) { int i; BLOGD(sc, DBG_LOAD, "Setting indirection table to:\n"); BLOGD(sc, DBG_LOAD, " 0x0000: "); for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++) { BLOGD(sc, DBG_LOAD, "0x%02x ", p->ind_table[i]); /* Print 4 bytes in a line */ if ((i + 1 < T_ETH_INDIRECTION_TABLE_SIZE) && (((i + 1) & 0x3) == 0)) { BLOGD(sc, DBG_LOAD, "\n"); BLOGD(sc, DBG_LOAD, "0x%04x: ", i + 1); } } BLOGD(sc, DBG_LOAD, "\n"); } #endif /* * FreeBSD Device probe function. * * Compares the device found to the driver's list of supported devices and * reports back to the bsd loader whether this is the right driver for the device. * This is the driver entry function called from the "kldload" command. * * Returns: * BUS_PROBE_DEFAULT on success, positive value on failure. */ static int bxe_probe(device_t dev) { struct bxe_softc *sc; struct bxe_device_type *t; char *descbuf; uint16_t did, sdid, svid, vid; /* Find our device structure */ sc = device_get_softc(dev); sc->dev = dev; t = bxe_devs; /* Get the data for the device to be probed. */ vid = pci_get_vendor(dev); did = pci_get_device(dev); svid = pci_get_subvendor(dev); sdid = pci_get_subdevice(dev); BLOGD(sc, DBG_LOAD, "%s(); VID = 0x%04X, DID = 0x%04X, SVID = 0x%04X, " "SDID = 0x%04X\n", __FUNCTION__, vid, did, svid, sdid); /* Look through the list of known devices for a match. */ while (t->bxe_name != NULL) { if ((vid == t->bxe_vid) && (did == t->bxe_did) && ((svid == t->bxe_svid) || (t->bxe_svid == PCI_ANY_ID)) && ((sdid == t->bxe_sdid) || (t->bxe_sdid == PCI_ANY_ID))) { descbuf = malloc(BXE_DEVDESC_MAX, M_TEMP, M_NOWAIT); if (descbuf == NULL) return (ENOMEM); /* Print out the device identity. */ snprintf(descbuf, BXE_DEVDESC_MAX, "%s (%c%d) BXE v:%s\n", t->bxe_name, (((pci_read_config(dev, PCIR_REVID, 4) & 0xf0) >> 4) + 'A'), (pci_read_config(dev, PCIR_REVID, 4) & 0xf), BXE_DRIVER_VERSION); device_set_desc_copy(dev, descbuf); free(descbuf, M_TEMP); return (BUS_PROBE_DEFAULT); } t++; } return (ENXIO); } static void bxe_init_mutexes(struct bxe_softc *sc) { #ifdef BXE_CORE_LOCK_SX snprintf(sc->core_sx_name, sizeof(sc->core_sx_name), "bxe%d_core_lock", sc->unit); sx_init(&sc->core_sx, sc->core_sx_name); #else snprintf(sc->core_mtx_name, sizeof(sc->core_mtx_name), "bxe%d_core_lock", sc->unit); mtx_init(&sc->core_mtx, sc->core_mtx_name, NULL, MTX_DEF); #endif snprintf(sc->sp_mtx_name, sizeof(sc->sp_mtx_name), "bxe%d_sp_lock", sc->unit); mtx_init(&sc->sp_mtx, sc->sp_mtx_name, NULL, MTX_DEF); snprintf(sc->dmae_mtx_name, sizeof(sc->dmae_mtx_name), "bxe%d_dmae_lock", sc->unit); mtx_init(&sc->dmae_mtx, sc->dmae_mtx_name, NULL, MTX_DEF); snprintf(sc->port.phy_mtx_name, sizeof(sc->port.phy_mtx_name), "bxe%d_phy_lock", sc->unit); mtx_init(&sc->port.phy_mtx, sc->port.phy_mtx_name, NULL, MTX_DEF); snprintf(sc->fwmb_mtx_name, sizeof(sc->fwmb_mtx_name), "bxe%d_fwmb_lock", sc->unit); mtx_init(&sc->fwmb_mtx, sc->fwmb_mtx_name, NULL, MTX_DEF); snprintf(sc->print_mtx_name, sizeof(sc->print_mtx_name), "bxe%d_print_lock", sc->unit); mtx_init(&(sc->print_mtx), sc->print_mtx_name, NULL, MTX_DEF); snprintf(sc->stats_mtx_name, sizeof(sc->stats_mtx_name), "bxe%d_stats_lock", sc->unit); mtx_init(&(sc->stats_mtx), sc->stats_mtx_name, NULL, MTX_DEF); snprintf(sc->mcast_mtx_name, sizeof(sc->mcast_mtx_name), "bxe%d_mcast_lock", sc->unit); mtx_init(&(sc->mcast_mtx), sc->mcast_mtx_name, NULL, MTX_DEF); } static void bxe_release_mutexes(struct bxe_softc *sc) { #ifdef BXE_CORE_LOCK_SX sx_destroy(&sc->core_sx); #else if (mtx_initialized(&sc->core_mtx)) { mtx_destroy(&sc->core_mtx); } #endif if (mtx_initialized(&sc->sp_mtx)) { mtx_destroy(&sc->sp_mtx); } if (mtx_initialized(&sc->dmae_mtx)) { mtx_destroy(&sc->dmae_mtx); } if (mtx_initialized(&sc->port.phy_mtx)) { mtx_destroy(&sc->port.phy_mtx); } if (mtx_initialized(&sc->fwmb_mtx)) { mtx_destroy(&sc->fwmb_mtx); } if (mtx_initialized(&sc->print_mtx)) { mtx_destroy(&sc->print_mtx); } if (mtx_initialized(&sc->stats_mtx)) { mtx_destroy(&sc->stats_mtx); } if (mtx_initialized(&sc->mcast_mtx)) { mtx_destroy(&sc->mcast_mtx); } } static void bxe_tx_disable(struct bxe_softc* sc) { if_t ifp = sc->ifp; /* tell the stack the driver is stopped and TX queue is full */ if (ifp != NULL) { if_setdrvflags(ifp, 0); } } static void bxe_drv_pulse(struct bxe_softc *sc) { SHMEM_WR(sc, func_mb[SC_FW_MB_IDX(sc)].drv_pulse_mb, sc->fw_drv_pulse_wr_seq); } static inline uint16_t bxe_tx_avail(struct bxe_softc *sc, struct bxe_fastpath *fp) { int16_t used; uint16_t prod; uint16_t cons; prod = fp->tx_bd_prod; cons = fp->tx_bd_cons; used = SUB_S16(prod, cons); #if 0 KASSERT((used < 0), ("used tx bds < 0")); KASSERT((used > sc->tx_ring_size), ("used tx bds > tx_ring_size")); KASSERT(((sc->tx_ring_size - used) > MAX_TX_AVAIL), ("invalid number of tx bds used")); #endif return (int16_t)(sc->tx_ring_size) - used; } static inline int bxe_tx_queue_has_work(struct bxe_fastpath *fp) { uint16_t hw_cons; mb(); /* status block fields can change */ hw_cons = le16toh(*fp->tx_cons_sb); return (hw_cons != fp->tx_pkt_cons); } static inline uint8_t bxe_has_tx_work(struct bxe_fastpath *fp) { /* expand this for multi-cos if ever supported */ return (bxe_tx_queue_has_work(fp)) ? TRUE : FALSE; } static inline int bxe_has_rx_work(struct bxe_fastpath *fp) { uint16_t rx_cq_cons_sb; mb(); /* status block fields can change */ rx_cq_cons_sb = le16toh(*fp->rx_cq_cons_sb); if ((rx_cq_cons_sb & RCQ_MAX) == RCQ_MAX) rx_cq_cons_sb++; return (fp->rx_cq_cons != rx_cq_cons_sb); } static void bxe_sp_event(struct bxe_softc *sc, struct bxe_fastpath *fp, union eth_rx_cqe *rr_cqe) { int cid = SW_CID(rr_cqe->ramrod_cqe.conn_and_cmd_data); int command = CQE_CMD(rr_cqe->ramrod_cqe.conn_and_cmd_data); enum ecore_queue_cmd drv_cmd = ECORE_Q_CMD_MAX; struct ecore_queue_sp_obj *q_obj = &BXE_SP_OBJ(sc, fp).q_obj; BLOGD(sc, DBG_SP, "fp=%d cid=%d got ramrod #%d state is %x type is %d\n", fp->index, cid, command, sc->state, rr_cqe->ramrod_cqe.ramrod_type); #if 0 /* * If cid is within VF range, replace the slowpath object with the * one corresponding to this VF */ if ((cid >= BXE_FIRST_VF_CID) && (cid < BXE_FIRST_VF_CID + BXE_VF_CIDS)) { bxe_iov_set_queue_sp_obj(sc, cid, &q_obj); } #endif switch (command) { case (RAMROD_CMD_ID_ETH_CLIENT_UPDATE): BLOGD(sc, DBG_SP, "got UPDATE ramrod. CID %d\n", cid); drv_cmd = ECORE_Q_CMD_UPDATE; break; case (RAMROD_CMD_ID_ETH_CLIENT_SETUP): BLOGD(sc, DBG_SP, "got MULTI[%d] setup ramrod\n", cid); drv_cmd = ECORE_Q_CMD_SETUP; break; case (RAMROD_CMD_ID_ETH_TX_QUEUE_SETUP): BLOGD(sc, DBG_SP, "got MULTI[%d] tx-only setup ramrod\n", cid); drv_cmd = ECORE_Q_CMD_SETUP_TX_ONLY; break; case (RAMROD_CMD_ID_ETH_HALT): BLOGD(sc, DBG_SP, "got MULTI[%d] halt ramrod\n", cid); drv_cmd = ECORE_Q_CMD_HALT; break; case (RAMROD_CMD_ID_ETH_TERMINATE): BLOGD(sc, DBG_SP, "got MULTI[%d] teminate ramrod\n", cid); drv_cmd = ECORE_Q_CMD_TERMINATE; break; case (RAMROD_CMD_ID_ETH_EMPTY): BLOGD(sc, DBG_SP, "got MULTI[%d] empty ramrod\n", cid); drv_cmd = ECORE_Q_CMD_EMPTY; break; default: BLOGD(sc, DBG_SP, "ERROR: unexpected MC reply (%d) on fp[%d]\n", command, fp->index); return; } if ((drv_cmd != ECORE_Q_CMD_MAX) && q_obj->complete_cmd(sc, q_obj, drv_cmd)) { /* * q_obj->complete_cmd() failure means that this was * an unexpected completion. * * In this case we don't want to increase the sc->spq_left * because apparently we haven't sent this command the first * place. */ // bxe_panic(sc, ("Unexpected SP completion\n")); return; } #if 0 /* SRIOV: reschedule any 'in_progress' operations */ bxe_iov_sp_event(sc, cid, TRUE); #endif atomic_add_acq_long(&sc->cq_spq_left, 1); BLOGD(sc, DBG_SP, "sc->cq_spq_left 0x%lx\n", atomic_load_acq_long(&sc->cq_spq_left)); #if 0 if ((drv_cmd == ECORE_Q_CMD_UPDATE) && (IS_FCOE_FP(fp)) && (!!bxe_test_bit(ECORE_AFEX_FCOE_Q_UPDATE_PENDING, &sc->sp_state))) { /* * If Queue update ramrod is completed for last Queue in AFEX VIF set * flow, then ACK MCP at the end. Mark pending ACK to MCP bit to * prevent case that both bits are cleared. At the end of load/unload * driver checks that sp_state is cleared and this order prevents * races. */ bxe_set_bit(ECORE_AFEX_PENDING_VIFSET_MCP_ACK, &sc->sp_state); wmb(); bxe_clear_bit(ECORE_AFEX_FCOE_Q_UPDATE_PENDING, &sc->sp_state); /* schedule the sp task as MCP ack is required */ bxe_schedule_sp_task(sc); } #endif } /* * The current mbuf is part of an aggregation. Move the mbuf into the TPA * aggregation queue, put an empty mbuf back onto the receive chain, and mark * the current aggregation queue as in-progress. */ static void bxe_tpa_start(struct bxe_softc *sc, struct bxe_fastpath *fp, uint16_t queue, uint16_t cons, uint16_t prod, struct eth_fast_path_rx_cqe *cqe) { struct bxe_sw_rx_bd tmp_bd; struct bxe_sw_rx_bd *rx_buf; struct eth_rx_bd *rx_bd; int max_agg_queues; struct bxe_sw_tpa_info *tpa_info = &fp->rx_tpa_info[queue]; uint16_t index; BLOGD(sc, DBG_LRO, "fp[%02d].tpa[%02d] TPA START " "cons=%d prod=%d\n", fp->index, queue, cons, prod); max_agg_queues = MAX_AGG_QS(sc); KASSERT((queue < max_agg_queues), ("fp[%02d] invalid aggr queue (%d >= %d)!", fp->index, queue, max_agg_queues)); KASSERT((tpa_info->state == BXE_TPA_STATE_STOP), ("fp[%02d].tpa[%02d] starting aggr on queue not stopped!", fp->index, queue)); /* copy the existing mbuf and mapping from the TPA pool */ tmp_bd = tpa_info->bd; if (tmp_bd.m == NULL) { BLOGE(sc, "fp[%02d].tpa[%02d] mbuf not allocated!\n", fp->index, queue); /* XXX Error handling? */ return; } /* change the TPA queue to the start state */ tpa_info->state = BXE_TPA_STATE_START; tpa_info->placement_offset = cqe->placement_offset; tpa_info->parsing_flags = le16toh(cqe->pars_flags.flags); tpa_info->vlan_tag = le16toh(cqe->vlan_tag); tpa_info->len_on_bd = le16toh(cqe->len_on_bd); fp->rx_tpa_queue_used |= (1 << queue); /* * If all the buffer descriptors are filled with mbufs then fill in * the current consumer index with a new BD. Else if a maximum Rx * buffer limit is imposed then fill in the next producer index. */ index = (sc->max_rx_bufs != RX_BD_USABLE) ? prod : cons; /* move the received mbuf and mapping to TPA pool */ tpa_info->bd = fp->rx_mbuf_chain[cons]; /* release any existing RX BD mbuf mappings */ if (cons != index) { rx_buf = &fp->rx_mbuf_chain[cons]; if (rx_buf->m_map != NULL) { bus_dmamap_sync(fp->rx_mbuf_tag, rx_buf->m_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fp->rx_mbuf_tag, rx_buf->m_map); } /* * We get here when the maximum number of rx buffers is less than * RX_BD_USABLE. The mbuf is already saved above so it's OK to NULL * it out here without concern of a memory leak. */ fp->rx_mbuf_chain[cons].m = NULL; } /* update the Rx SW BD with the mbuf info from the TPA pool */ fp->rx_mbuf_chain[index] = tmp_bd; /* update the Rx BD with the empty mbuf phys address from the TPA pool */ rx_bd = &fp->rx_chain[index]; rx_bd->addr_hi = htole32(U64_HI(tpa_info->seg.ds_addr)); rx_bd->addr_lo = htole32(U64_LO(tpa_info->seg.ds_addr)); } /* * When a TPA aggregation is completed, loop through the individual mbufs * of the aggregation, combining them into a single mbuf which will be sent * up the stack. Refill all freed SGEs with mbufs as we go along. */ static int bxe_fill_frag_mbuf(struct bxe_softc *sc, struct bxe_fastpath *fp, struct bxe_sw_tpa_info *tpa_info, uint16_t queue, uint16_t pages, struct mbuf *m, struct eth_end_agg_rx_cqe *cqe, uint16_t cqe_idx) { struct mbuf *m_frag; uint32_t frag_len, frag_size, i; uint16_t sge_idx; int rc = 0; int j; frag_size = le16toh(cqe->pkt_len) - tpa_info->len_on_bd; BLOGD(sc, DBG_LRO, "fp[%02d].tpa[%02d] TPA fill len_on_bd=%d frag_size=%d pages=%d\n", fp->index, queue, tpa_info->len_on_bd, frag_size, pages); /* make sure the aggregated frame is not too big to handle */ if (pages > 8 * PAGES_PER_SGE) { BLOGE(sc, "fp[%02d].sge[0x%04x] has too many pages (%d)! " "pkt_len=%d len_on_bd=%d frag_size=%d\n", fp->index, cqe_idx, pages, le16toh(cqe->pkt_len), tpa_info->len_on_bd, frag_size); bxe_panic(sc, ("sge page count error\n")); return (EINVAL); } /* * Scan through the scatter gather list pulling individual mbufs into a * single mbuf for the host stack. */ for (i = 0, j = 0; i < pages; i += PAGES_PER_SGE, j++) { sge_idx = RX_SGE(le16toh(cqe->sgl_or_raw_data.sgl[j])); /* * Firmware gives the indices of the SGE as if the ring is an array * (meaning that the "next" element will consume 2 indices). */ frag_len = min(frag_size, (uint32_t)(SGE_PAGES)); BLOGD(sc, DBG_LRO, "fp[%02d].tpa[%02d] TPA fill i=%d j=%d " "sge_idx=%d frag_size=%d frag_len=%d\n", fp->index, queue, i, j, sge_idx, frag_size, frag_len); m_frag = fp->rx_sge_mbuf_chain[sge_idx].m; /* allocate a new mbuf for the SGE */ rc = bxe_alloc_rx_sge_mbuf(fp, sge_idx); if (rc) { /* Leave all remaining SGEs in the ring! */ return (rc); } /* update the fragment length */ m_frag->m_len = frag_len; /* concatenate the fragment to the head mbuf */ m_cat(m, m_frag); fp->eth_q_stats.mbuf_alloc_sge--; /* update the TPA mbuf size and remaining fragment size */ m->m_pkthdr.len += frag_len; frag_size -= frag_len; } BLOGD(sc, DBG_LRO, "fp[%02d].tpa[%02d] TPA fill done frag_size=%d\n", fp->index, queue, frag_size); return (rc); } static inline void bxe_clear_sge_mask_next_elems(struct bxe_fastpath *fp) { int i, j; for (i = 1; i <= RX_SGE_NUM_PAGES; i++) { int idx = RX_SGE_TOTAL_PER_PAGE * i - 1; for (j = 0; j < 2; j++) { BIT_VEC64_CLEAR_BIT(fp->sge_mask, idx); idx--; } } } static inline void bxe_init_sge_ring_bit_mask(struct bxe_fastpath *fp) { /* set the mask to all 1's, it's faster to compare to 0 than to 0xf's */ memset(fp->sge_mask, 0xff, sizeof(fp->sge_mask)); /* * Clear the two last indices in the page to 1. These are the indices that * correspond to the "next" element, hence will never be indicated and * should be removed from the calculations. */ bxe_clear_sge_mask_next_elems(fp); } static inline void bxe_update_last_max_sge(struct bxe_fastpath *fp, uint16_t idx) { uint16_t last_max = fp->last_max_sge; if (SUB_S16(idx, last_max) > 0) { fp->last_max_sge = idx; } } static inline void bxe_update_sge_prod(struct bxe_softc *sc, struct bxe_fastpath *fp, uint16_t sge_len, struct eth_end_agg_rx_cqe *cqe) { uint16_t last_max, last_elem, first_elem; uint16_t delta = 0; uint16_t i; if (!sge_len) { return; } /* first mark all used pages */ for (i = 0; i < sge_len; i++) { BIT_VEC64_CLEAR_BIT(fp->sge_mask, RX_SGE(le16toh(cqe->sgl_or_raw_data.sgl[i]))); } BLOGD(sc, DBG_LRO, "fp[%02d] fp_cqe->sgl[%d] = %d\n", fp->index, sge_len - 1, le16toh(cqe->sgl_or_raw_data.sgl[sge_len - 1])); /* assume that the last SGE index is the biggest */ bxe_update_last_max_sge(fp, le16toh(cqe->sgl_or_raw_data.sgl[sge_len - 1])); last_max = RX_SGE(fp->last_max_sge); last_elem = last_max >> BIT_VEC64_ELEM_SHIFT; first_elem = RX_SGE(fp->rx_sge_prod) >> BIT_VEC64_ELEM_SHIFT; /* if ring is not full */ if (last_elem + 1 != first_elem) { last_elem++; } /* now update the prod */ for (i = first_elem; i != last_elem; i = RX_SGE_NEXT_MASK_ELEM(i)) { if (__predict_true(fp->sge_mask[i])) { break; } fp->sge_mask[i] = BIT_VEC64_ELEM_ONE_MASK; delta += BIT_VEC64_ELEM_SZ; } if (delta > 0) { fp->rx_sge_prod += delta; /* clear page-end entries */ bxe_clear_sge_mask_next_elems(fp); } BLOGD(sc, DBG_LRO, "fp[%02d] fp->last_max_sge=%d fp->rx_sge_prod=%d\n", fp->index, fp->last_max_sge, fp->rx_sge_prod); } /* * The aggregation on the current TPA queue has completed. Pull the individual * mbuf fragments together into a single mbuf, perform all necessary checksum * calculations, and send the resuting mbuf to the stack. */ static void bxe_tpa_stop(struct bxe_softc *sc, struct bxe_fastpath *fp, struct bxe_sw_tpa_info *tpa_info, uint16_t queue, uint16_t pages, struct eth_end_agg_rx_cqe *cqe, uint16_t cqe_idx) { if_t ifp = sc->ifp; struct mbuf *m; int rc = 0; BLOGD(sc, DBG_LRO, "fp[%02d].tpa[%02d] pad=%d pkt_len=%d pages=%d vlan=%d\n", fp->index, queue, tpa_info->placement_offset, le16toh(cqe->pkt_len), pages, tpa_info->vlan_tag); m = tpa_info->bd.m; /* allocate a replacement before modifying existing mbuf */ rc = bxe_alloc_rx_tpa_mbuf(fp, queue); if (rc) { /* drop the frame and log an error */ fp->eth_q_stats.rx_soft_errors++; goto bxe_tpa_stop_exit; } /* we have a replacement, fixup the current mbuf */ m_adj(m, tpa_info->placement_offset); m->m_pkthdr.len = m->m_len = tpa_info->len_on_bd; /* mark the checksums valid (taken care of by the firmware) */ fp->eth_q_stats.rx_ofld_frames_csum_ip++; fp->eth_q_stats.rx_ofld_frames_csum_tcp_udp++; m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); /* aggregate all of the SGEs into a single mbuf */ rc = bxe_fill_frag_mbuf(sc, fp, tpa_info, queue, pages, m, cqe, cqe_idx); if (rc) { /* drop the packet and log an error */ fp->eth_q_stats.rx_soft_errors++; m_freem(m); } else { if (tpa_info->parsing_flags & PARSING_FLAGS_VLAN) { m->m_pkthdr.ether_vtag = tpa_info->vlan_tag; m->m_flags |= M_VLANTAG; } /* assign packet to this interface interface */ if_setrcvif(m, ifp); #if __FreeBSD_version >= 800000 /* specify what RSS queue was used for this flow */ m->m_pkthdr.flowid = fp->index; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); #endif if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); fp->eth_q_stats.rx_tpa_pkts++; /* pass the frame to the stack */ if_input(ifp, m); } /* we passed an mbuf up the stack or dropped the frame */ fp->eth_q_stats.mbuf_alloc_tpa--; bxe_tpa_stop_exit: fp->rx_tpa_info[queue].state = BXE_TPA_STATE_STOP; fp->rx_tpa_queue_used &= ~(1 << queue); } static uint8_t bxe_rxeof(struct bxe_softc *sc, struct bxe_fastpath *fp) { if_t ifp = sc->ifp; uint16_t bd_cons, bd_prod, bd_prod_fw, comp_ring_cons; uint16_t hw_cq_cons, sw_cq_cons, sw_cq_prod; int rx_pkts = 0; int rc; BXE_FP_RX_LOCK(fp); /* CQ "next element" is of the size of the regular element */ hw_cq_cons = le16toh(*fp->rx_cq_cons_sb); if ((hw_cq_cons & RCQ_USABLE_PER_PAGE) == RCQ_USABLE_PER_PAGE) { hw_cq_cons++; } bd_cons = fp->rx_bd_cons; bd_prod = fp->rx_bd_prod; bd_prod_fw = bd_prod; sw_cq_cons = fp->rx_cq_cons; sw_cq_prod = fp->rx_cq_prod; /* * Memory barrier necessary as speculative reads of the rx * buffer can be ahead of the index in the status block */ rmb(); BLOGD(sc, DBG_RX, "fp[%02d] Rx START hw_cq_cons=%u sw_cq_cons=%u\n", fp->index, hw_cq_cons, sw_cq_cons); while (sw_cq_cons != hw_cq_cons) { struct bxe_sw_rx_bd *rx_buf = NULL; union eth_rx_cqe *cqe; struct eth_fast_path_rx_cqe *cqe_fp; uint8_t cqe_fp_flags; enum eth_rx_cqe_type cqe_fp_type; uint16_t len, pad; struct mbuf *m = NULL; comp_ring_cons = RCQ(sw_cq_cons); bd_prod = RX_BD(bd_prod); bd_cons = RX_BD(bd_cons); cqe = &fp->rcq_chain[comp_ring_cons]; cqe_fp = &cqe->fast_path_cqe; cqe_fp_flags = cqe_fp->type_error_flags; cqe_fp_type = cqe_fp_flags & ETH_FAST_PATH_RX_CQE_TYPE; BLOGD(sc, DBG_RX, "fp[%02d] Rx hw_cq_cons=%d hw_sw_cons=%d " "BD prod=%d cons=%d CQE type=0x%x err=0x%x " "status=0x%x rss_hash=0x%x vlan=0x%x len=%u\n", fp->index, hw_cq_cons, sw_cq_cons, bd_prod, bd_cons, CQE_TYPE(cqe_fp_flags), cqe_fp_flags, cqe_fp->status_flags, le32toh(cqe_fp->rss_hash_result), le16toh(cqe_fp->vlan_tag), le16toh(cqe_fp->pkt_len_or_gro_seg_len)); /* is this a slowpath msg? */ if (__predict_false(CQE_TYPE_SLOW(cqe_fp_type))) { bxe_sp_event(sc, fp, cqe); goto next_cqe; } rx_buf = &fp->rx_mbuf_chain[bd_cons]; if (!CQE_TYPE_FAST(cqe_fp_type)) { struct bxe_sw_tpa_info *tpa_info; uint16_t frag_size, pages; uint8_t queue; #if 0 /* sanity check */ if (!fp->tpa_enable && (CQE_TYPE_START(cqe_fp_type) || CQE_TYPE_STOP(cqe_fp_type))) { BLOGE(sc, "START/STOP packet while !tpa_enable type (0x%x)\n", CQE_TYPE(cqe_fp_type)); } #endif if (CQE_TYPE_START(cqe_fp_type)) { bxe_tpa_start(sc, fp, cqe_fp->queue_index, bd_cons, bd_prod, cqe_fp); m = NULL; /* packet not ready yet */ goto next_rx; } KASSERT(CQE_TYPE_STOP(cqe_fp_type), ("CQE type is not STOP! (0x%x)\n", cqe_fp_type)); queue = cqe->end_agg_cqe.queue_index; tpa_info = &fp->rx_tpa_info[queue]; BLOGD(sc, DBG_LRO, "fp[%02d].tpa[%02d] TPA STOP\n", fp->index, queue); frag_size = (le16toh(cqe->end_agg_cqe.pkt_len) - tpa_info->len_on_bd); pages = SGE_PAGE_ALIGN(frag_size) >> SGE_PAGE_SHIFT; bxe_tpa_stop(sc, fp, tpa_info, queue, pages, &cqe->end_agg_cqe, comp_ring_cons); bxe_update_sge_prod(sc, fp, pages, &cqe->end_agg_cqe); goto next_cqe; } /* non TPA */ /* is this an error packet? */ if (__predict_false(cqe_fp_flags & ETH_FAST_PATH_RX_CQE_PHY_DECODE_ERR_FLG)) { BLOGE(sc, "flags 0x%x rx packet %u\n", cqe_fp_flags, sw_cq_cons); fp->eth_q_stats.rx_soft_errors++; goto next_rx; } len = le16toh(cqe_fp->pkt_len_or_gro_seg_len); pad = cqe_fp->placement_offset; m = rx_buf->m; if (__predict_false(m == NULL)) { BLOGE(sc, "No mbuf in rx chain descriptor %d for fp[%02d]\n", bd_cons, fp->index); goto next_rx; } /* XXX double copy if packet length under a threshold */ /* * If all the buffer descriptors are filled with mbufs then fill in * the current consumer index with a new BD. Else if a maximum Rx * buffer limit is imposed then fill in the next producer index. */ rc = bxe_alloc_rx_bd_mbuf(fp, bd_cons, (sc->max_rx_bufs != RX_BD_USABLE) ? bd_prod : bd_cons); if (rc != 0) { BLOGE(sc, "mbuf alloc fail for fp[%02d] rx chain (%d)\n", fp->index, rc); fp->eth_q_stats.rx_soft_errors++; if (sc->max_rx_bufs != RX_BD_USABLE) { /* copy this consumer index to the producer index */ memcpy(&fp->rx_mbuf_chain[bd_prod], rx_buf, sizeof(struct bxe_sw_rx_bd)); memset(rx_buf, 0, sizeof(struct bxe_sw_rx_bd)); } goto next_rx; } /* current mbuf was detached from the bd */ fp->eth_q_stats.mbuf_alloc_rx--; /* we allocated a replacement mbuf, fixup the current one */ m_adj(m, pad); m->m_pkthdr.len = m->m_len = len; /* assign packet to this interface interface */ if_setrcvif(m, ifp); /* assume no hardware checksum has complated */ m->m_pkthdr.csum_flags = 0; /* validate checksum if offload enabled */ if (if_getcapenable(ifp) & IFCAP_RXCSUM) { /* check for a valid IP frame */ if (!(cqe->fast_path_cqe.status_flags & ETH_FAST_PATH_RX_CQE_IP_XSUM_NO_VALIDATION_FLG)) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; if (__predict_false(cqe_fp_flags & ETH_FAST_PATH_RX_CQE_IP_BAD_XSUM_FLG)) { fp->eth_q_stats.rx_hw_csum_errors++; } else { fp->eth_q_stats.rx_ofld_frames_csum_ip++; m->m_pkthdr.csum_flags |= CSUM_IP_VALID; } } /* check for a valid TCP/UDP frame */ if (!(cqe->fast_path_cqe.status_flags & ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG)) { if (__predict_false(cqe_fp_flags & ETH_FAST_PATH_RX_CQE_L4_BAD_XSUM_FLG)) { fp->eth_q_stats.rx_hw_csum_errors++; } else { fp->eth_q_stats.rx_ofld_frames_csum_tcp_udp++; m->m_pkthdr.csum_data = 0xFFFF; m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); } } } /* if there is a VLAN tag then flag that info */ if (cqe->fast_path_cqe.pars_flags.flags & PARSING_FLAGS_VLAN) { m->m_pkthdr.ether_vtag = cqe->fast_path_cqe.vlan_tag; m->m_flags |= M_VLANTAG; } #if __FreeBSD_version >= 800000 /* specify what RSS queue was used for this flow */ m->m_pkthdr.flowid = fp->index; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); #endif next_rx: bd_cons = RX_BD_NEXT(bd_cons); bd_prod = RX_BD_NEXT(bd_prod); bd_prod_fw = RX_BD_NEXT(bd_prod_fw); /* pass the frame to the stack */ if (__predict_true(m != NULL)) { if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); rx_pkts++; if_input(ifp, m); } next_cqe: sw_cq_prod = RCQ_NEXT(sw_cq_prod); sw_cq_cons = RCQ_NEXT(sw_cq_cons); /* limit spinning on the queue */ if (rx_pkts == sc->rx_budget) { fp->eth_q_stats.rx_budget_reached++; break; } } /* while work to do */ fp->rx_bd_cons = bd_cons; fp->rx_bd_prod = bd_prod_fw; fp->rx_cq_cons = sw_cq_cons; fp->rx_cq_prod = sw_cq_prod; /* Update producers */ bxe_update_rx_prod(sc, fp, bd_prod_fw, sw_cq_prod, fp->rx_sge_prod); fp->eth_q_stats.rx_pkts += rx_pkts; fp->eth_q_stats.rx_calls++; BXE_FP_RX_UNLOCK(fp); return (sw_cq_cons != hw_cq_cons); } static uint16_t bxe_free_tx_pkt(struct bxe_softc *sc, struct bxe_fastpath *fp, uint16_t idx) { struct bxe_sw_tx_bd *tx_buf = &fp->tx_mbuf_chain[idx]; struct eth_tx_start_bd *tx_start_bd; uint16_t bd_idx = TX_BD(tx_buf->first_bd); uint16_t new_cons; int nbd; /* unmap the mbuf from non-paged memory */ bus_dmamap_unload(fp->tx_mbuf_tag, tx_buf->m_map); tx_start_bd = &fp->tx_chain[bd_idx].start_bd; nbd = le16toh(tx_start_bd->nbd) - 1; #if 0 if ((nbd - 1) > (MAX_MBUF_FRAGS + 2)) { bxe_panic(sc, ("BAD nbd!\n")); } #endif new_cons = (tx_buf->first_bd + nbd); #if 0 struct eth_tx_bd *tx_data_bd; /* * The following code doesn't do anything but is left here * for clarity on what the new value of new_cons skipped. */ /* get the next bd */ bd_idx = TX_BD(TX_BD_NEXT(bd_idx)); /* skip the parse bd */ --nbd; bd_idx = TX_BD(TX_BD_NEXT(bd_idx)); /* skip the TSO split header bd since they have no mapping */ if (tx_buf->flags & BXE_TSO_SPLIT_BD) { --nbd; bd_idx = TX_BD(TX_BD_NEXT(bd_idx)); } /* now free frags */ while (nbd > 0) { tx_data_bd = &fp->tx_chain[bd_idx].reg_bd; if (--nbd) { bd_idx = TX_BD(TX_BD_NEXT(bd_idx)); } } #endif /* free the mbuf */ if (__predict_true(tx_buf->m != NULL)) { m_freem(tx_buf->m); fp->eth_q_stats.mbuf_alloc_tx--; } else { fp->eth_q_stats.tx_chain_lost_mbuf++; } tx_buf->m = NULL; tx_buf->first_bd = 0; return (new_cons); } /* transmit timeout watchdog */ static int bxe_watchdog(struct bxe_softc *sc, struct bxe_fastpath *fp) { BXE_FP_TX_LOCK(fp); if ((fp->watchdog_timer == 0) || (--fp->watchdog_timer)) { BXE_FP_TX_UNLOCK(fp); return (0); } BLOGE(sc, "TX watchdog timeout on fp[%02d], resetting!\n", fp->index); BXE_FP_TX_UNLOCK(fp); atomic_store_rel_long(&sc->chip_tq_flags, CHIP_TQ_REINIT); taskqueue_enqueue(sc->chip_tq, &sc->chip_tq_task); return (-1); } /* processes transmit completions */ static uint8_t bxe_txeof(struct bxe_softc *sc, struct bxe_fastpath *fp) { if_t ifp = sc->ifp; uint16_t bd_cons, hw_cons, sw_cons, pkt_cons; uint16_t tx_bd_avail; BXE_FP_TX_LOCK_ASSERT(fp); bd_cons = fp->tx_bd_cons; hw_cons = le16toh(*fp->tx_cons_sb); sw_cons = fp->tx_pkt_cons; while (sw_cons != hw_cons) { pkt_cons = TX_BD(sw_cons); BLOGD(sc, DBG_TX, "TX: fp[%d]: hw_cons=%u sw_cons=%u pkt_cons=%u\n", fp->index, hw_cons, sw_cons, pkt_cons); bd_cons = bxe_free_tx_pkt(sc, fp, pkt_cons); sw_cons++; } fp->tx_pkt_cons = sw_cons; fp->tx_bd_cons = bd_cons; BLOGD(sc, DBG_TX, "TX done: fp[%d]: hw_cons=%u sw_cons=%u sw_prod=%u\n", fp->index, hw_cons, fp->tx_pkt_cons, fp->tx_pkt_prod); mb(); tx_bd_avail = bxe_tx_avail(sc, fp); if (tx_bd_avail < BXE_TX_CLEANUP_THRESHOLD) { if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); } else { if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); } if (fp->tx_pkt_prod != fp->tx_pkt_cons) { /* reset the watchdog timer if there are pending transmits */ fp->watchdog_timer = BXE_TX_TIMEOUT; return (TRUE); } else { /* clear watchdog when there are no pending transmits */ fp->watchdog_timer = 0; return (FALSE); } } static void bxe_drain_tx_queues(struct bxe_softc *sc) { struct bxe_fastpath *fp; int i, count; /* wait until all TX fastpath tasks have completed */ for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; count = 1000; while (bxe_has_tx_work(fp)) { BXE_FP_TX_LOCK(fp); bxe_txeof(sc, fp); BXE_FP_TX_UNLOCK(fp); if (count == 0) { BLOGE(sc, "Timeout waiting for fp[%d] " "transmits to complete!\n", i); bxe_panic(sc, ("tx drain failure\n")); return; } count--; DELAY(1000); rmb(); } } return; } static int bxe_del_all_macs(struct bxe_softc *sc, struct ecore_vlan_mac_obj *mac_obj, int mac_type, uint8_t wait_for_comp) { unsigned long ramrod_flags = 0, vlan_mac_flags = 0; int rc; /* wait for completion of requested */ if (wait_for_comp) { bxe_set_bit(RAMROD_COMP_WAIT, &ramrod_flags); } /* Set the mac type of addresses we want to clear */ bxe_set_bit(mac_type, &vlan_mac_flags); rc = mac_obj->delete_all(sc, mac_obj, &vlan_mac_flags, &ramrod_flags); if (rc < 0) { BLOGE(sc, "Failed to delete MACs (%d)\n", rc); } return (rc); } static int bxe_fill_accept_flags(struct bxe_softc *sc, uint32_t rx_mode, unsigned long *rx_accept_flags, unsigned long *tx_accept_flags) { /* Clear the flags first */ *rx_accept_flags = 0; *tx_accept_flags = 0; switch (rx_mode) { case BXE_RX_MODE_NONE: /* * 'drop all' supersedes any accept flags that may have been * passed to the function. */ break; case BXE_RX_MODE_NORMAL: bxe_set_bit(ECORE_ACCEPT_UNICAST, rx_accept_flags); bxe_set_bit(ECORE_ACCEPT_MULTICAST, rx_accept_flags); bxe_set_bit(ECORE_ACCEPT_BROADCAST, rx_accept_flags); /* internal switching mode */ bxe_set_bit(ECORE_ACCEPT_UNICAST, tx_accept_flags); bxe_set_bit(ECORE_ACCEPT_MULTICAST, tx_accept_flags); bxe_set_bit(ECORE_ACCEPT_BROADCAST, tx_accept_flags); break; case BXE_RX_MODE_ALLMULTI: bxe_set_bit(ECORE_ACCEPT_UNICAST, rx_accept_flags); bxe_set_bit(ECORE_ACCEPT_ALL_MULTICAST, rx_accept_flags); bxe_set_bit(ECORE_ACCEPT_BROADCAST, rx_accept_flags); /* internal switching mode */ bxe_set_bit(ECORE_ACCEPT_UNICAST, tx_accept_flags); bxe_set_bit(ECORE_ACCEPT_ALL_MULTICAST, tx_accept_flags); bxe_set_bit(ECORE_ACCEPT_BROADCAST, tx_accept_flags); break; case BXE_RX_MODE_PROMISC: /* * According to deffinition of SI mode, iface in promisc mode * should receive matched and unmatched (in resolution of port) * unicast packets. */ bxe_set_bit(ECORE_ACCEPT_UNMATCHED, rx_accept_flags); bxe_set_bit(ECORE_ACCEPT_UNICAST, rx_accept_flags); bxe_set_bit(ECORE_ACCEPT_ALL_MULTICAST, rx_accept_flags); bxe_set_bit(ECORE_ACCEPT_BROADCAST, rx_accept_flags); /* internal switching mode */ bxe_set_bit(ECORE_ACCEPT_ALL_MULTICAST, tx_accept_flags); bxe_set_bit(ECORE_ACCEPT_BROADCAST, tx_accept_flags); if (IS_MF_SI(sc)) { bxe_set_bit(ECORE_ACCEPT_ALL_UNICAST, tx_accept_flags); } else { bxe_set_bit(ECORE_ACCEPT_UNICAST, tx_accept_flags); } break; default: BLOGE(sc, "Unknown rx_mode (%d)\n", rx_mode); return (-1); } /* Set ACCEPT_ANY_VLAN as we do not enable filtering by VLAN */ if (rx_mode != BXE_RX_MODE_NONE) { bxe_set_bit(ECORE_ACCEPT_ANY_VLAN, rx_accept_flags); bxe_set_bit(ECORE_ACCEPT_ANY_VLAN, tx_accept_flags); } return (0); } static int bxe_set_q_rx_mode(struct bxe_softc *sc, uint8_t cl_id, unsigned long rx_mode_flags, unsigned long rx_accept_flags, unsigned long tx_accept_flags, unsigned long ramrod_flags) { struct ecore_rx_mode_ramrod_params ramrod_param; int rc; memset(&ramrod_param, 0, sizeof(ramrod_param)); /* Prepare ramrod parameters */ ramrod_param.cid = 0; ramrod_param.cl_id = cl_id; ramrod_param.rx_mode_obj = &sc->rx_mode_obj; ramrod_param.func_id = SC_FUNC(sc); ramrod_param.pstate = &sc->sp_state; ramrod_param.state = ECORE_FILTER_RX_MODE_PENDING; ramrod_param.rdata = BXE_SP(sc, rx_mode_rdata); ramrod_param.rdata_mapping = BXE_SP_MAPPING(sc, rx_mode_rdata); bxe_set_bit(ECORE_FILTER_RX_MODE_PENDING, &sc->sp_state); ramrod_param.ramrod_flags = ramrod_flags; ramrod_param.rx_mode_flags = rx_mode_flags; ramrod_param.rx_accept_flags = rx_accept_flags; ramrod_param.tx_accept_flags = tx_accept_flags; rc = ecore_config_rx_mode(sc, &ramrod_param); if (rc < 0) { BLOGE(sc, "Set rx_mode %d failed\n", sc->rx_mode); return (rc); } return (0); } static int bxe_set_storm_rx_mode(struct bxe_softc *sc) { unsigned long rx_mode_flags = 0, ramrod_flags = 0; unsigned long rx_accept_flags = 0, tx_accept_flags = 0; int rc; rc = bxe_fill_accept_flags(sc, sc->rx_mode, &rx_accept_flags, &tx_accept_flags); if (rc) { return (rc); } bxe_set_bit(RAMROD_RX, &ramrod_flags); bxe_set_bit(RAMROD_TX, &ramrod_flags); /* XXX ensure all fastpath have same cl_id and/or move it to bxe_softc */ return (bxe_set_q_rx_mode(sc, sc->fp[0].cl_id, rx_mode_flags, rx_accept_flags, tx_accept_flags, ramrod_flags)); } /* returns the "mcp load_code" according to global load_count array */ static int bxe_nic_load_no_mcp(struct bxe_softc *sc) { int path = SC_PATH(sc); int port = SC_PORT(sc); BLOGI(sc, "NO MCP - load counts[%d] %d, %d, %d\n", path, load_count[path][0], load_count[path][1], load_count[path][2]); load_count[path][0]++; load_count[path][1 + port]++; BLOGI(sc, "NO MCP - new load counts[%d] %d, %d, %d\n", path, load_count[path][0], load_count[path][1], load_count[path][2]); if (load_count[path][0] == 1) { return (FW_MSG_CODE_DRV_LOAD_COMMON); } else if (load_count[path][1 + port] == 1) { return (FW_MSG_CODE_DRV_LOAD_PORT); } else { return (FW_MSG_CODE_DRV_LOAD_FUNCTION); } } /* returns the "mcp load_code" according to global load_count array */ static int bxe_nic_unload_no_mcp(struct bxe_softc *sc) { int port = SC_PORT(sc); int path = SC_PATH(sc); BLOGI(sc, "NO MCP - load counts[%d] %d, %d, %d\n", path, load_count[path][0], load_count[path][1], load_count[path][2]); load_count[path][0]--; load_count[path][1 + port]--; BLOGI(sc, "NO MCP - new load counts[%d] %d, %d, %d\n", path, load_count[path][0], load_count[path][1], load_count[path][2]); if (load_count[path][0] == 0) { return (FW_MSG_CODE_DRV_UNLOAD_COMMON); } else if (load_count[path][1 + port] == 0) { return (FW_MSG_CODE_DRV_UNLOAD_PORT); } else { return (FW_MSG_CODE_DRV_UNLOAD_FUNCTION); } } /* request unload mode from the MCP: COMMON, PORT or FUNCTION */ static uint32_t bxe_send_unload_req(struct bxe_softc *sc, int unload_mode) { uint32_t reset_code = 0; #if 0 int port = SC_PORT(sc); int path = SC_PATH(sc); #endif /* Select the UNLOAD request mode */ if (unload_mode == UNLOAD_NORMAL) { reset_code = DRV_MSG_CODE_UNLOAD_REQ_WOL_DIS; } #if 0 else if (sc->flags & BXE_NO_WOL_FLAG) { reset_code = DRV_MSG_CODE_UNLOAD_REQ_WOL_MCP; } else if (sc->wol) { uint32_t emac_base = port ? GRCBASE_EMAC1 : GRCBASE_EMAC0; uint8_t *mac_addr = sc->dev->dev_addr; uint32_t val; uint16_t pmc; /* * The mac address is written to entries 1-4 to * preserve entry 0 which is used by the PMF */ uint8_t entry = (SC_VN(sc) + 1)*8; val = (mac_addr[0] << 8) | mac_addr[1]; EMAC_WR(sc, EMAC_REG_EMAC_MAC_MATCH + entry, val); val = (mac_addr[2] << 24) | (mac_addr[3] << 16) | (mac_addr[4] << 8) | mac_addr[5]; EMAC_WR(sc, EMAC_REG_EMAC_MAC_MATCH + entry + 4, val); /* Enable the PME and clear the status */ pmc = pci_read_config(sc->dev, (sc->devinfo.pcie_pm_cap_reg + PCIR_POWER_STATUS), 2); pmc |= PCIM_PSTAT_PMEENABLE | PCIM_PSTAT_PME; pci_write_config(sc->dev, (sc->devinfo.pcie_pm_cap_reg + PCIR_POWER_STATUS), pmc, 4); reset_code = DRV_MSG_CODE_UNLOAD_REQ_WOL_EN; } #endif else { reset_code = DRV_MSG_CODE_UNLOAD_REQ_WOL_DIS; } /* Send the request to the MCP */ if (!BXE_NOMCP(sc)) { reset_code = bxe_fw_command(sc, reset_code, 0); } else { reset_code = bxe_nic_unload_no_mcp(sc); } return (reset_code); } /* send UNLOAD_DONE command to the MCP */ static void bxe_send_unload_done(struct bxe_softc *sc, uint8_t keep_link) { uint32_t reset_param = keep_link ? DRV_MSG_CODE_UNLOAD_SKIP_LINK_RESET : 0; /* Report UNLOAD_DONE to MCP */ if (!BXE_NOMCP(sc)) { bxe_fw_command(sc, DRV_MSG_CODE_UNLOAD_DONE, reset_param); } } static int bxe_func_wait_started(struct bxe_softc *sc) { int tout = 50; if (!sc->port.pmf) { return (0); } /* * (assumption: No Attention from MCP at this stage) * PMF probably in the middle of TX disable/enable transaction * 1. Sync IRS for default SB * 2. Sync SP queue - this guarantees us that attention handling started * 3. Wait, that TX disable/enable transaction completes * * 1+2 guarantee that if DCBX attention was scheduled it already changed * pending bit of transaction from STARTED-->TX_STOPPED, if we already * received completion for the transaction the state is TX_STOPPED. * State will return to STARTED after completion of TX_STOPPED-->STARTED * transaction. */ /* XXX make sure default SB ISR is done */ /* need a way to synchronize an irq (intr_mtx?) */ /* XXX flush any work queues */ while (ecore_func_get_state(sc, &sc->func_obj) != ECORE_F_STATE_STARTED && tout--) { DELAY(20000); } if (ecore_func_get_state(sc, &sc->func_obj) != ECORE_F_STATE_STARTED) { /* * Failed to complete the transaction in a "good way" * Force both transactions with CLR bit. */ struct ecore_func_state_params func_params = { NULL }; BLOGE(sc, "Unexpected function state! " "Forcing STARTED-->TX_STOPPED-->STARTED\n"); func_params.f_obj = &sc->func_obj; bxe_set_bit(RAMROD_DRV_CLR_ONLY, &func_params.ramrod_flags); /* STARTED-->TX_STOPPED */ func_params.cmd = ECORE_F_CMD_TX_STOP; ecore_func_state_change(sc, &func_params); /* TX_STOPPED-->STARTED */ func_params.cmd = ECORE_F_CMD_TX_START; return (ecore_func_state_change(sc, &func_params)); } return (0); } static int bxe_stop_queue(struct bxe_softc *sc, int index) { struct bxe_fastpath *fp = &sc->fp[index]; struct ecore_queue_state_params q_params = { NULL }; int rc; BLOGD(sc, DBG_LOAD, "stopping queue %d cid %d\n", index, fp->index); q_params.q_obj = &sc->sp_objs[fp->index].q_obj; /* We want to wait for completion in this context */ bxe_set_bit(RAMROD_COMP_WAIT, &q_params.ramrod_flags); /* Stop the primary connection: */ /* ...halt the connection */ q_params.cmd = ECORE_Q_CMD_HALT; rc = ecore_queue_state_change(sc, &q_params); if (rc) { return (rc); } /* ...terminate the connection */ q_params.cmd = ECORE_Q_CMD_TERMINATE; memset(&q_params.params.terminate, 0, sizeof(q_params.params.terminate)); q_params.params.terminate.cid_index = FIRST_TX_COS_INDEX; rc = ecore_queue_state_change(sc, &q_params); if (rc) { return (rc); } /* ...delete cfc entry */ q_params.cmd = ECORE_Q_CMD_CFC_DEL; memset(&q_params.params.cfc_del, 0, sizeof(q_params.params.cfc_del)); q_params.params.cfc_del.cid_index = FIRST_TX_COS_INDEX; return (ecore_queue_state_change(sc, &q_params)); } /* wait for the outstanding SP commands */ static inline uint8_t bxe_wait_sp_comp(struct bxe_softc *sc, unsigned long mask) { unsigned long tmp; int tout = 5000; /* wait for 5 secs tops */ while (tout--) { mb(); if (!(atomic_load_acq_long(&sc->sp_state) & mask)) { return (TRUE); } DELAY(1000); } mb(); tmp = atomic_load_acq_long(&sc->sp_state); if (tmp & mask) { BLOGE(sc, "Filtering completion timed out: " "sp_state 0x%lx, mask 0x%lx\n", tmp, mask); return (FALSE); } return (FALSE); } static int bxe_func_stop(struct bxe_softc *sc) { struct ecore_func_state_params func_params = { NULL }; int rc; /* prepare parameters for function state transitions */ bxe_set_bit(RAMROD_COMP_WAIT, &func_params.ramrod_flags); func_params.f_obj = &sc->func_obj; func_params.cmd = ECORE_F_CMD_STOP; /* * Try to stop the function the 'good way'. If it fails (in case * of a parity error during bxe_chip_cleanup()) and we are * not in a debug mode, perform a state transaction in order to * enable further HW_RESET transaction. */ rc = ecore_func_state_change(sc, &func_params); if (rc) { BLOGE(sc, "FUNC_STOP ramrod failed. " "Running a dry transaction\n"); bxe_set_bit(RAMROD_DRV_CLR_ONLY, &func_params.ramrod_flags); return (ecore_func_state_change(sc, &func_params)); } return (0); } static int bxe_reset_hw(struct bxe_softc *sc, uint32_t load_code) { struct ecore_func_state_params func_params = { NULL }; /* Prepare parameters for function state transitions */ bxe_set_bit(RAMROD_COMP_WAIT, &func_params.ramrod_flags); func_params.f_obj = &sc->func_obj; func_params.cmd = ECORE_F_CMD_HW_RESET; func_params.params.hw_init.load_phase = load_code; return (ecore_func_state_change(sc, &func_params)); } static void bxe_int_disable_sync(struct bxe_softc *sc, int disable_hw) { if (disable_hw) { /* prevent the HW from sending interrupts */ bxe_int_disable(sc); } /* XXX need a way to synchronize ALL irqs (intr_mtx?) */ /* make sure all ISRs are done */ /* XXX make sure sp_task is not running */ /* cancel and flush work queues */ } static void bxe_chip_cleanup(struct bxe_softc *sc, uint32_t unload_mode, uint8_t keep_link) { int port = SC_PORT(sc); struct ecore_mcast_ramrod_params rparam = { NULL }; uint32_t reset_code; int i, rc = 0; bxe_drain_tx_queues(sc); /* give HW time to discard old tx messages */ DELAY(1000); /* Clean all ETH MACs */ rc = bxe_del_all_macs(sc, &sc->sp_objs[0].mac_obj, ECORE_ETH_MAC, FALSE); if (rc < 0) { BLOGE(sc, "Failed to delete all ETH MACs (%d)\n", rc); } /* Clean up UC list */ rc = bxe_del_all_macs(sc, &sc->sp_objs[0].mac_obj, ECORE_UC_LIST_MAC, TRUE); if (rc < 0) { BLOGE(sc, "Failed to delete UC MACs list (%d)\n", rc); } /* Disable LLH */ if (!CHIP_IS_E1(sc)) { REG_WR(sc, NIG_REG_LLH0_FUNC_EN + port*8, 0); } /* Set "drop all" to stop Rx */ /* * We need to take the BXE_MCAST_LOCK() here in order to prevent * a race between the completion code and this code. */ BXE_MCAST_LOCK(sc); if (bxe_test_bit(ECORE_FILTER_RX_MODE_PENDING, &sc->sp_state)) { bxe_set_bit(ECORE_FILTER_RX_MODE_SCHED, &sc->sp_state); } else { bxe_set_storm_rx_mode(sc); } /* Clean up multicast configuration */ rparam.mcast_obj = &sc->mcast_obj; rc = ecore_config_mcast(sc, &rparam, ECORE_MCAST_CMD_DEL); if (rc < 0) { BLOGE(sc, "Failed to send DEL MCAST command (%d)\n", rc); } BXE_MCAST_UNLOCK(sc); // XXX bxe_iov_chip_cleanup(sc); /* * Send the UNLOAD_REQUEST to the MCP. This will return if * this function should perform FUNCTION, PORT, or COMMON HW * reset. */ reset_code = bxe_send_unload_req(sc, unload_mode); /* * (assumption: No Attention from MCP at this stage) * PMF probably in the middle of TX disable/enable transaction */ rc = bxe_func_wait_started(sc); if (rc) { BLOGE(sc, "bxe_func_wait_started failed\n"); } /* * Close multi and leading connections * Completions for ramrods are collected in a synchronous way */ for (i = 0; i < sc->num_queues; i++) { if (bxe_stop_queue(sc, i)) { goto unload_error; } } /* * If SP settings didn't get completed so far - something * very wrong has happen. */ if (!bxe_wait_sp_comp(sc, ~0x0UL)) { BLOGE(sc, "Common slow path ramrods got stuck!\n"); } unload_error: rc = bxe_func_stop(sc); if (rc) { BLOGE(sc, "Function stop failed!\n"); } /* disable HW interrupts */ bxe_int_disable_sync(sc, TRUE); /* detach interrupts */ bxe_interrupt_detach(sc); /* Reset the chip */ rc = bxe_reset_hw(sc, reset_code); if (rc) { BLOGE(sc, "Hardware reset failed\n"); } /* Report UNLOAD_DONE to MCP */ bxe_send_unload_done(sc, keep_link); } static void bxe_disable_close_the_gate(struct bxe_softc *sc) { uint32_t val; int port = SC_PORT(sc); BLOGD(sc, DBG_LOAD, "Disabling 'close the gates'\n"); if (CHIP_IS_E1(sc)) { uint32_t addr = port ? MISC_REG_AEU_MASK_ATTN_FUNC_1 : MISC_REG_AEU_MASK_ATTN_FUNC_0; val = REG_RD(sc, addr); val &= ~(0x300); REG_WR(sc, addr, val); } else { val = REG_RD(sc, MISC_REG_AEU_GENERAL_MASK); val &= ~(MISC_AEU_GENERAL_MASK_REG_AEU_PXP_CLOSE_MASK | MISC_AEU_GENERAL_MASK_REG_AEU_NIG_CLOSE_MASK); REG_WR(sc, MISC_REG_AEU_GENERAL_MASK, val); } } /* * Cleans the object that have internal lists without sending * ramrods. Should be run when interrutps are disabled. */ static void bxe_squeeze_objects(struct bxe_softc *sc) { unsigned long ramrod_flags = 0, vlan_mac_flags = 0; struct ecore_mcast_ramrod_params rparam = { NULL }; struct ecore_vlan_mac_obj *mac_obj = &sc->sp_objs->mac_obj; int rc; /* Cleanup MACs' object first... */ /* Wait for completion of requested */ bxe_set_bit(RAMROD_COMP_WAIT, &ramrod_flags); /* Perform a dry cleanup */ bxe_set_bit(RAMROD_DRV_CLR_ONLY, &ramrod_flags); /* Clean ETH primary MAC */ bxe_set_bit(ECORE_ETH_MAC, &vlan_mac_flags); rc = mac_obj->delete_all(sc, &sc->sp_objs->mac_obj, &vlan_mac_flags, &ramrod_flags); if (rc != 0) { BLOGE(sc, "Failed to clean ETH MACs (%d)\n", rc); } /* Cleanup UC list */ vlan_mac_flags = 0; bxe_set_bit(ECORE_UC_LIST_MAC, &vlan_mac_flags); rc = mac_obj->delete_all(sc, mac_obj, &vlan_mac_flags, &ramrod_flags); if (rc != 0) { BLOGE(sc, "Failed to clean UC list MACs (%d)\n", rc); } /* Now clean mcast object... */ rparam.mcast_obj = &sc->mcast_obj; bxe_set_bit(RAMROD_DRV_CLR_ONLY, &rparam.ramrod_flags); /* Add a DEL command... */ rc = ecore_config_mcast(sc, &rparam, ECORE_MCAST_CMD_DEL); if (rc < 0) { BLOGE(sc, "Failed to send DEL MCAST command (%d)\n", rc); } /* now wait until all pending commands are cleared */ rc = ecore_config_mcast(sc, &rparam, ECORE_MCAST_CMD_CONT); while (rc != 0) { if (rc < 0) { BLOGE(sc, "Failed to clean MCAST object (%d)\n", rc); return; } rc = ecore_config_mcast(sc, &rparam, ECORE_MCAST_CMD_CONT); } } /* stop the controller */ static __noinline int bxe_nic_unload(struct bxe_softc *sc, uint32_t unload_mode, uint8_t keep_link) { uint8_t global = FALSE; uint32_t val; BXE_CORE_LOCK_ASSERT(sc); BLOGD(sc, DBG_LOAD, "Starting NIC unload...\n"); /* mark driver as unloaded in shmem2 */ if (IS_PF(sc) && SHMEM2_HAS(sc, drv_capabilities_flag)) { val = SHMEM2_RD(sc, drv_capabilities_flag[SC_FW_MB_IDX(sc)]); SHMEM2_WR(sc, drv_capabilities_flag[SC_FW_MB_IDX(sc)], val & ~DRV_FLAGS_CAPABILITIES_LOADED_L2); } if (IS_PF(sc) && sc->recovery_state != BXE_RECOVERY_DONE && (sc->state == BXE_STATE_CLOSED || sc->state == BXE_STATE_ERROR)) { /* * We can get here if the driver has been unloaded * during parity error recovery and is either waiting for a * leader to complete or for other functions to unload and * then ifconfig down has been issued. In this case we want to * unload and let other functions to complete a recovery * process. */ sc->recovery_state = BXE_RECOVERY_DONE; sc->is_leader = 0; bxe_release_leader_lock(sc); mb(); BLOGD(sc, DBG_LOAD, "Releasing a leadership...\n"); BLOGE(sc, "Can't unload in closed or error state\n"); return (-1); } /* * Nothing to do during unload if previous bxe_nic_load() * did not completed succesfully - all resourses are released. */ if ((sc->state == BXE_STATE_CLOSED) || (sc->state == BXE_STATE_ERROR)) { return (0); } sc->state = BXE_STATE_CLOSING_WAITING_HALT; mb(); /* stop tx */ bxe_tx_disable(sc); sc->rx_mode = BXE_RX_MODE_NONE; /* XXX set rx mode ??? */ if (IS_PF(sc)) { /* set ALWAYS_ALIVE bit in shmem */ sc->fw_drv_pulse_wr_seq |= DRV_PULSE_ALWAYS_ALIVE; bxe_drv_pulse(sc); bxe_stats_handle(sc, STATS_EVENT_STOP); bxe_save_statistics(sc); } /* wait till consumers catch up with producers in all queues */ bxe_drain_tx_queues(sc); /* if VF indicate to PF this function is going down (PF will delete sp * elements and clear initializations */ if (IS_VF(sc)) { ; /* bxe_vfpf_close_vf(sc); */ } else if (unload_mode != UNLOAD_RECOVERY) { /* if this is a normal/close unload need to clean up chip */ bxe_chip_cleanup(sc, unload_mode, keep_link); } else { /* Send the UNLOAD_REQUEST to the MCP */ bxe_send_unload_req(sc, unload_mode); /* * Prevent transactions to host from the functions on the * engine that doesn't reset global blocks in case of global * attention once gloabl blocks are reset and gates are opened * (the engine which leader will perform the recovery * last). */ if (!CHIP_IS_E1x(sc)) { bxe_pf_disable(sc); } /* disable HW interrupts */ bxe_int_disable_sync(sc, TRUE); /* detach interrupts */ bxe_interrupt_detach(sc); /* Report UNLOAD_DONE to MCP */ bxe_send_unload_done(sc, FALSE); } /* * At this stage no more interrupts will arrive so we may safely clean * the queue'able objects here in case they failed to get cleaned so far. */ if (IS_PF(sc)) { bxe_squeeze_objects(sc); } /* There should be no more pending SP commands at this stage */ sc->sp_state = 0; sc->port.pmf = 0; bxe_free_fp_buffers(sc); if (IS_PF(sc)) { bxe_free_mem(sc); } bxe_free_fw_stats_mem(sc); sc->state = BXE_STATE_CLOSED; /* * Check if there are pending parity attentions. If there are - set * RECOVERY_IN_PROGRESS. */ if (IS_PF(sc) && bxe_chk_parity_attn(sc, &global, FALSE)) { bxe_set_reset_in_progress(sc); /* Set RESET_IS_GLOBAL if needed */ if (global) { bxe_set_reset_global(sc); } } /* * The last driver must disable a "close the gate" if there is no * parity attention or "process kill" pending. */ if (IS_PF(sc) && !bxe_clear_pf_load(sc) && bxe_reset_is_done(sc, SC_PATH(sc))) { bxe_disable_close_the_gate(sc); } BLOGD(sc, DBG_LOAD, "Ended NIC unload\n"); return (0); } /* * Called by the OS to set various media options (i.e. link, speed, etc.) when * the user runs "ifconfig bxe media ..." or "ifconfig bxe mediaopt ...". */ static int bxe_ifmedia_update(struct ifnet *ifp) { struct bxe_softc *sc = (struct bxe_softc *)if_getsoftc(ifp); struct ifmedia *ifm; ifm = &sc->ifmedia; /* We only support Ethernet media type. */ if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) { return (EINVAL); } switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: break; case IFM_10G_CX4: case IFM_10G_SR: case IFM_10G_T: case IFM_10G_TWINAX: default: /* We don't support changing the media type. */ BLOGD(sc, DBG_LOAD, "Invalid media type (%d)\n", IFM_SUBTYPE(ifm->ifm_media)); return (EINVAL); } return (0); } /* * Called by the OS to get the current media status (i.e. link, speed, etc.). */ static void bxe_ifmedia_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct bxe_softc *sc = if_getsoftc(ifp); /* Report link down if the driver isn't running. */ if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { ifmr->ifm_active |= IFM_NONE; return; } /* Setup the default interface info. */ ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (sc->link_vars.link_up) { ifmr->ifm_status |= IFM_ACTIVE; } else { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_active |= sc->media; if (sc->link_vars.duplex == DUPLEX_FULL) { ifmr->ifm_active |= IFM_FDX; } else { ifmr->ifm_active |= IFM_HDX; } } static int bxe_ioctl_nvram(struct bxe_softc *sc, uint32_t priv_op, struct ifreq *ifr) { struct bxe_nvram_data nvdata_base; struct bxe_nvram_data *nvdata; int len; int error = 0; copyin(ifr->ifr_data, &nvdata_base, sizeof(nvdata_base)); len = (sizeof(struct bxe_nvram_data) + nvdata_base.len - sizeof(uint32_t)); if (len > sizeof(struct bxe_nvram_data)) { if ((nvdata = (struct bxe_nvram_data *) malloc(len, M_DEVBUF, (M_NOWAIT | M_ZERO))) == NULL) { BLOGE(sc, "BXE_IOC_RD_NVRAM malloc failed\n"); return (1); } memcpy(nvdata, &nvdata_base, sizeof(struct bxe_nvram_data)); } else { nvdata = &nvdata_base; } if (priv_op == BXE_IOC_RD_NVRAM) { BLOGD(sc, DBG_IOCTL, "IOC_RD_NVRAM 0x%x %d\n", nvdata->offset, nvdata->len); error = bxe_nvram_read(sc, nvdata->offset, (uint8_t *)nvdata->value, nvdata->len); copyout(nvdata, ifr->ifr_data, len); } else { /* BXE_IOC_WR_NVRAM */ BLOGD(sc, DBG_IOCTL, "IOC_WR_NVRAM 0x%x %d\n", nvdata->offset, nvdata->len); copyin(ifr->ifr_data, nvdata, len); error = bxe_nvram_write(sc, nvdata->offset, (uint8_t *)nvdata->value, nvdata->len); } if (len > sizeof(struct bxe_nvram_data)) { free(nvdata, M_DEVBUF); } return (error); } static int bxe_ioctl_stats_show(struct bxe_softc *sc, uint32_t priv_op, struct ifreq *ifr) { const size_t str_size = (BXE_NUM_ETH_STATS * STAT_NAME_LEN); const size_t stats_size = (BXE_NUM_ETH_STATS * sizeof(uint64_t)); caddr_t p_tmp; uint32_t *offset; int i; switch (priv_op) { case BXE_IOC_STATS_SHOW_NUM: memset(ifr->ifr_data, 0, sizeof(union bxe_stats_show_data)); ((union bxe_stats_show_data *)ifr->ifr_data)->desc.num = BXE_NUM_ETH_STATS; ((union bxe_stats_show_data *)ifr->ifr_data)->desc.len = STAT_NAME_LEN; return (0); case BXE_IOC_STATS_SHOW_STR: memset(ifr->ifr_data, 0, str_size); p_tmp = ifr->ifr_data; for (i = 0; i < BXE_NUM_ETH_STATS; i++) { strcpy(p_tmp, bxe_eth_stats_arr[i].string); p_tmp += STAT_NAME_LEN; } return (0); case BXE_IOC_STATS_SHOW_CNT: memset(ifr->ifr_data, 0, stats_size); p_tmp = ifr->ifr_data; for (i = 0; i < BXE_NUM_ETH_STATS; i++) { offset = ((uint32_t *)&sc->eth_stats + bxe_eth_stats_arr[i].offset); switch (bxe_eth_stats_arr[i].size) { case 4: *((uint64_t *)p_tmp) = (uint64_t)*offset; break; case 8: *((uint64_t *)p_tmp) = HILO_U64(*offset, *(offset + 1)); break; default: *((uint64_t *)p_tmp) = 0; } p_tmp += sizeof(uint64_t); } return (0); default: return (-1); } } static void bxe_handle_chip_tq(void *context, int pending) { struct bxe_softc *sc = (struct bxe_softc *)context; long work = atomic_load_acq_long(&sc->chip_tq_flags); switch (work) { case CHIP_TQ_START: if ((if_getflags(sc->ifp) & IFF_UP) && !(if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING)) { /* start the interface */ BLOGD(sc, DBG_LOAD, "Starting the interface...\n"); BXE_CORE_LOCK(sc); bxe_init_locked(sc); BXE_CORE_UNLOCK(sc); } break; case CHIP_TQ_STOP: if (!(if_getflags(sc->ifp) & IFF_UP) && (if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING)) { /* bring down the interface */ BLOGD(sc, DBG_LOAD, "Stopping the interface...\n"); bxe_periodic_stop(sc); BXE_CORE_LOCK(sc); bxe_stop_locked(sc); BXE_CORE_UNLOCK(sc); } break; case CHIP_TQ_REINIT: if (if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING) { /* restart the interface */ BLOGD(sc, DBG_LOAD, "Restarting the interface...\n"); bxe_periodic_stop(sc); BXE_CORE_LOCK(sc); bxe_stop_locked(sc); bxe_init_locked(sc); BXE_CORE_UNLOCK(sc); } break; default: break; } } /* * Handles any IOCTL calls from the operating system. * * Returns: * 0 = Success, >0 Failure */ static int bxe_ioctl(if_t ifp, u_long command, caddr_t data) { struct bxe_softc *sc = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data; struct bxe_nvram_data *nvdata; uint32_t priv_op; int mask = 0; int reinit = 0; int error = 0; int mtu_min = (ETH_MIN_PACKET_SIZE - ETH_HLEN); int mtu_max = (MJUM9BYTES - ETH_OVERHEAD - IP_HEADER_ALIGNMENT_PADDING); switch (command) { case SIOCSIFMTU: BLOGD(sc, DBG_IOCTL, "Received SIOCSIFMTU ioctl (mtu=%d)\n", ifr->ifr_mtu); if (sc->mtu == ifr->ifr_mtu) { /* nothing to change */ break; } if ((ifr->ifr_mtu < mtu_min) || (ifr->ifr_mtu > mtu_max)) { BLOGE(sc, "Unsupported MTU size %d (range is %d-%d)\n", ifr->ifr_mtu, mtu_min, mtu_max); error = EINVAL; break; } atomic_store_rel_int((volatile unsigned int *)&sc->mtu, (unsigned long)ifr->ifr_mtu); /* atomic_store_rel_long((volatile unsigned long *)&if_getmtu(ifp), (unsigned long)ifr->ifr_mtu); XXX - Not sure why it needs to be atomic */ if_setmtu(ifp, ifr->ifr_mtu); reinit = 1; break; case SIOCSIFFLAGS: /* toggle the interface state up or down */ BLOGD(sc, DBG_IOCTL, "Received SIOCSIFFLAGS ioctl\n"); /* check if the interface is up */ if (if_getflags(ifp) & IFF_UP) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { /* set the receive mode flags */ bxe_set_rx_mode(sc); } else { atomic_store_rel_long(&sc->chip_tq_flags, CHIP_TQ_START); taskqueue_enqueue(sc->chip_tq, &sc->chip_tq_task); } } else { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { atomic_store_rel_long(&sc->chip_tq_flags, CHIP_TQ_STOP); taskqueue_enqueue(sc->chip_tq, &sc->chip_tq_task); } } break; case SIOCADDMULTI: case SIOCDELMULTI: /* add/delete multicast addresses */ BLOGD(sc, DBG_IOCTL, "Received SIOCADDMULTI/SIOCDELMULTI ioctl\n"); /* check if the interface is up */ if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { /* set the receive mode flags */ bxe_set_rx_mode(sc); } break; case SIOCSIFCAP: /* find out which capabilities have changed */ mask = (ifr->ifr_reqcap ^ if_getcapenable(ifp)); BLOGD(sc, DBG_IOCTL, "Received SIOCSIFCAP ioctl (mask=0x%08x)\n", mask); /* toggle the LRO capabilites enable flag */ if (mask & IFCAP_LRO) { if_togglecapenable(ifp, IFCAP_LRO); BLOGD(sc, DBG_IOCTL, "Turning LRO %s\n", (if_getcapenable(ifp) & IFCAP_LRO) ? "ON" : "OFF"); reinit = 1; } /* toggle the TXCSUM checksum capabilites enable flag */ if (mask & IFCAP_TXCSUM) { if_togglecapenable(ifp, IFCAP_TXCSUM); BLOGD(sc, DBG_IOCTL, "Turning TXCSUM %s\n", (if_getcapenable(ifp) & IFCAP_TXCSUM) ? "ON" : "OFF"); if (if_getcapenable(ifp) & IFCAP_TXCSUM) { if_sethwassistbits(ifp, (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_TSO | CSUM_TCP_IPV6 | CSUM_UDP_IPV6), 0); } else { if_clearhwassist(ifp); /* XXX */ } } /* toggle the RXCSUM checksum capabilities enable flag */ if (mask & IFCAP_RXCSUM) { if_togglecapenable(ifp, IFCAP_RXCSUM); BLOGD(sc, DBG_IOCTL, "Turning RXCSUM %s\n", (if_getcapenable(ifp) & IFCAP_RXCSUM) ? "ON" : "OFF"); if (if_getcapenable(ifp) & IFCAP_RXCSUM) { if_sethwassistbits(ifp, (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_TSO | CSUM_TCP_IPV6 | CSUM_UDP_IPV6), 0); } else { if_clearhwassist(ifp); /* XXX */ } } /* toggle TSO4 capabilities enabled flag */ if (mask & IFCAP_TSO4) { if_togglecapenable(ifp, IFCAP_TSO4); BLOGD(sc, DBG_IOCTL, "Turning TSO4 %s\n", (if_getcapenable(ifp) & IFCAP_TSO4) ? "ON" : "OFF"); } /* toggle TSO6 capabilities enabled flag */ if (mask & IFCAP_TSO6) { if_togglecapenable(ifp, IFCAP_TSO6); BLOGD(sc, DBG_IOCTL, "Turning TSO6 %s\n", (if_getcapenable(ifp) & IFCAP_TSO6) ? "ON" : "OFF"); } /* toggle VLAN_HWTSO capabilities enabled flag */ if (mask & IFCAP_VLAN_HWTSO) { if_togglecapenable(ifp, IFCAP_VLAN_HWTSO); BLOGD(sc, DBG_IOCTL, "Turning VLAN_HWTSO %s\n", (if_getcapenable(ifp) & IFCAP_VLAN_HWTSO) ? "ON" : "OFF"); } /* toggle VLAN_HWCSUM capabilities enabled flag */ if (mask & IFCAP_VLAN_HWCSUM) { /* XXX investigate this... */ BLOGE(sc, "Changing VLAN_HWCSUM is not supported!\n"); error = EINVAL; } /* toggle VLAN_MTU capabilities enable flag */ if (mask & IFCAP_VLAN_MTU) { /* XXX investigate this... */ BLOGE(sc, "Changing VLAN_MTU is not supported!\n"); error = EINVAL; } /* toggle VLAN_HWTAGGING capabilities enabled flag */ if (mask & IFCAP_VLAN_HWTAGGING) { /* XXX investigate this... */ BLOGE(sc, "Changing VLAN_HWTAGGING is not supported!\n"); error = EINVAL; } /* toggle VLAN_HWFILTER capabilities enabled flag */ if (mask & IFCAP_VLAN_HWFILTER) { /* XXX investigate this... */ BLOGE(sc, "Changing VLAN_HWFILTER is not supported!\n"); error = EINVAL; } /* XXX not yet... * IFCAP_WOL_MAGIC */ break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: /* set/get interface media */ BLOGD(sc, DBG_IOCTL, "Received SIOCSIFMEDIA/SIOCGIFMEDIA ioctl (cmd=%lu)\n", (command & 0xff)); error = ifmedia_ioctl(ifp, ifr, &sc->ifmedia, command); break; case SIOCGPRIVATE_0: copyin(ifr->ifr_data, &priv_op, sizeof(priv_op)); switch (priv_op) { case BXE_IOC_RD_NVRAM: case BXE_IOC_WR_NVRAM: nvdata = (struct bxe_nvram_data *)ifr->ifr_data; BLOGD(sc, DBG_IOCTL, "Received Private NVRAM ioctl addr=0x%x size=%u\n", nvdata->offset, nvdata->len); error = bxe_ioctl_nvram(sc, priv_op, ifr); break; case BXE_IOC_STATS_SHOW_NUM: case BXE_IOC_STATS_SHOW_STR: case BXE_IOC_STATS_SHOW_CNT: BLOGD(sc, DBG_IOCTL, "Received Private Stats ioctl (%d)\n", priv_op); error = bxe_ioctl_stats_show(sc, priv_op, ifr); break; default: BLOGW(sc, "Received Private Unknown ioctl (%d)\n", priv_op); error = EINVAL; break; } break; default: BLOGD(sc, DBG_IOCTL, "Received Unknown Ioctl (cmd=%lu)\n", (command & 0xff)); error = ether_ioctl(ifp, command, data); break; } if (reinit && (if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING)) { BLOGD(sc, DBG_LOAD | DBG_IOCTL, "Re-initializing hardware from IOCTL change\n"); atomic_store_rel_long(&sc->chip_tq_flags, CHIP_TQ_REINIT); taskqueue_enqueue(sc->chip_tq, &sc->chip_tq_task); } return (error); } static __noinline void bxe_dump_mbuf(struct bxe_softc *sc, struct mbuf *m, uint8_t contents) { char * type; int i = 0; if (!(sc->debug & DBG_MBUF)) { return; } if (m == NULL) { BLOGD(sc, DBG_MBUF, "mbuf: null pointer\n"); return; } while (m) { BLOGD(sc, DBG_MBUF, "%02d: mbuf=%p m_len=%d m_flags=0x%b m_data=%p\n", i, m, m->m_len, m->m_flags, M_FLAG_BITS, m->m_data); if (m->m_flags & M_PKTHDR) { BLOGD(sc, DBG_MBUF, "%02d: - m_pkthdr: tot_len=%d flags=0x%b csum_flags=%b\n", i, m->m_pkthdr.len, m->m_flags, M_FLAG_BITS, (int)m->m_pkthdr.csum_flags, CSUM_BITS); } if (m->m_flags & M_EXT) { switch (m->m_ext.ext_type) { case EXT_CLUSTER: type = "EXT_CLUSTER"; break; case EXT_SFBUF: type = "EXT_SFBUF"; break; case EXT_JUMBOP: type = "EXT_JUMBOP"; break; case EXT_JUMBO9: type = "EXT_JUMBO9"; break; case EXT_JUMBO16: type = "EXT_JUMBO16"; break; case EXT_PACKET: type = "EXT_PACKET"; break; case EXT_MBUF: type = "EXT_MBUF"; break; case EXT_NET_DRV: type = "EXT_NET_DRV"; break; case EXT_MOD_TYPE: type = "EXT_MOD_TYPE"; break; case EXT_DISPOSABLE: type = "EXT_DISPOSABLE"; break; case EXT_EXTREF: type = "EXT_EXTREF"; break; default: type = "UNKNOWN"; break; } BLOGD(sc, DBG_MBUF, "%02d: - m_ext: %p ext_size=%d type=%s\n", i, m->m_ext.ext_buf, m->m_ext.ext_size, type); } if (contents) { bxe_dump_mbuf_data(sc, "mbuf data", m, TRUE); } m = m->m_next; i++; } } /* * Checks to ensure the 13 bd sliding window is >= MSS for TSO. * Check that (13 total bds - 3 bds) = 10 bd window >= MSS. * The window: 3 bds are = 1 for headers BD + 2 for parse BD and last BD * The headers comes in a seperate bd in FreeBSD so 13-3=10. * Returns: 0 if OK to send, 1 if packet needs further defragmentation */ static int bxe_chktso_window(struct bxe_softc *sc, int nsegs, bus_dma_segment_t *segs, struct mbuf *m) { uint32_t num_wnds, wnd_size, wnd_sum; int32_t frag_idx, wnd_idx; unsigned short lso_mss; int defrag; defrag = 0; wnd_sum = 0; wnd_size = 10; num_wnds = nsegs - wnd_size; lso_mss = htole16(m->m_pkthdr.tso_segsz); /* * Total header lengths Eth+IP+TCP in first FreeBSD mbuf so calculate the * first window sum of data while skipping the first assuming it is the * header in FreeBSD. */ for (frag_idx = 1; (frag_idx <= wnd_size); frag_idx++) { wnd_sum += htole16(segs[frag_idx].ds_len); } /* check the first 10 bd window size */ if (wnd_sum < lso_mss) { return (1); } /* run through the windows */ for (wnd_idx = 0; wnd_idx < num_wnds; wnd_idx++, frag_idx++) { /* subtract the first mbuf->m_len of the last wndw(-header) */ wnd_sum -= htole16(segs[wnd_idx+1].ds_len); /* add the next mbuf len to the len of our new window */ wnd_sum += htole16(segs[frag_idx].ds_len); if (wnd_sum < lso_mss) { return (1); } } return (0); } static uint8_t bxe_set_pbd_csum_e2(struct bxe_fastpath *fp, struct mbuf *m, uint32_t *parsing_data) { struct ether_vlan_header *eh = NULL; struct ip *ip4 = NULL; struct ip6_hdr *ip6 = NULL; caddr_t ip = NULL; struct tcphdr *th = NULL; int e_hlen, ip_hlen, l4_off; uint16_t proto; if (m->m_pkthdr.csum_flags == CSUM_IP) { /* no L4 checksum offload needed */ return (0); } /* get the Ethernet header */ eh = mtod(m, struct ether_vlan_header *); /* handle VLAN encapsulation if present */ if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { e_hlen = (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); proto = ntohs(eh->evl_proto); } else { e_hlen = ETHER_HDR_LEN; proto = ntohs(eh->evl_encap_proto); } switch (proto) { case ETHERTYPE_IP: /* get the IP header, if mbuf len < 20 then header in next mbuf */ ip4 = (m->m_len < sizeof(struct ip)) ? (struct ip *)m->m_next->m_data : (struct ip *)(m->m_data + e_hlen); /* ip_hl is number of 32-bit words */ ip_hlen = (ip4->ip_hl << 2); ip = (caddr_t)ip4; break; case ETHERTYPE_IPV6: /* get the IPv6 header, if mbuf len < 40 then header in next mbuf */ ip6 = (m->m_len < sizeof(struct ip6_hdr)) ? (struct ip6_hdr *)m->m_next->m_data : (struct ip6_hdr *)(m->m_data + e_hlen); /* XXX cannot support offload with IPv6 extensions */ ip_hlen = sizeof(struct ip6_hdr); ip = (caddr_t)ip6; break; default: /* We can't offload in this case... */ /* XXX error stat ??? */ return (0); } /* XXX assuming L4 header is contiguous to IPv4/IPv6 in the same mbuf */ l4_off = (e_hlen + ip_hlen); *parsing_data |= (((l4_off >> 1) << ETH_TX_PARSE_BD_E2_L4_HDR_START_OFFSET_W_SHIFT) & ETH_TX_PARSE_BD_E2_L4_HDR_START_OFFSET_W); if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TSO | CSUM_TCP_IPV6)) { fp->eth_q_stats.tx_ofld_frames_csum_tcp++; th = (struct tcphdr *)(ip + ip_hlen); /* th_off is number of 32-bit words */ *parsing_data |= ((th->th_off << ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW_SHIFT) & ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW); return (l4_off + (th->th_off << 2)); /* entire header length */ } else if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6)) { fp->eth_q_stats.tx_ofld_frames_csum_udp++; return (l4_off + sizeof(struct udphdr)); /* entire header length */ } else { /* XXX error stat ??? */ return (0); } } static uint8_t bxe_set_pbd_csum(struct bxe_fastpath *fp, struct mbuf *m, struct eth_tx_parse_bd_e1x *pbd) { struct ether_vlan_header *eh = NULL; struct ip *ip4 = NULL; struct ip6_hdr *ip6 = NULL; caddr_t ip = NULL; struct tcphdr *th = NULL; struct udphdr *uh = NULL; int e_hlen, ip_hlen; uint16_t proto; uint8_t hlen; uint16_t tmp_csum; uint32_t *tmp_uh; /* get the Ethernet header */ eh = mtod(m, struct ether_vlan_header *); /* handle VLAN encapsulation if present */ if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { e_hlen = (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); proto = ntohs(eh->evl_proto); } else { e_hlen = ETHER_HDR_LEN; proto = ntohs(eh->evl_encap_proto); } switch (proto) { case ETHERTYPE_IP: /* get the IP header, if mbuf len < 20 then header in next mbuf */ ip4 = (m->m_len < sizeof(struct ip)) ? (struct ip *)m->m_next->m_data : (struct ip *)(m->m_data + e_hlen); /* ip_hl is number of 32-bit words */ ip_hlen = (ip4->ip_hl << 1); ip = (caddr_t)ip4; break; case ETHERTYPE_IPV6: /* get the IPv6 header, if mbuf len < 40 then header in next mbuf */ ip6 = (m->m_len < sizeof(struct ip6_hdr)) ? (struct ip6_hdr *)m->m_next->m_data : (struct ip6_hdr *)(m->m_data + e_hlen); /* XXX cannot support offload with IPv6 extensions */ ip_hlen = (sizeof(struct ip6_hdr) >> 1); ip = (caddr_t)ip6; break; default: /* We can't offload in this case... */ /* XXX error stat ??? */ return (0); } hlen = (e_hlen >> 1); /* note that rest of global_data is indirectly zeroed here */ if (m->m_flags & M_VLANTAG) { pbd->global_data = htole16(hlen | (1 << ETH_TX_PARSE_BD_E1X_LLC_SNAP_EN_SHIFT)); } else { pbd->global_data = htole16(hlen); } pbd->ip_hlen_w = ip_hlen; hlen += pbd->ip_hlen_w; /* XXX assuming L4 header is contiguous to IPv4/IPv6 in the same mbuf */ if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TSO | CSUM_TCP_IPV6)) { th = (struct tcphdr *)(ip + (ip_hlen << 1)); /* th_off is number of 32-bit words */ hlen += (uint16_t)(th->th_off << 1); } else if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6)) { uh = (struct udphdr *)(ip + (ip_hlen << 1)); hlen += (sizeof(struct udphdr) / 2); } else { /* valid case as only CSUM_IP was set */ return (0); } pbd->total_hlen_w = htole16(hlen); if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TSO | CSUM_TCP_IPV6)) { fp->eth_q_stats.tx_ofld_frames_csum_tcp++; pbd->tcp_pseudo_csum = ntohs(th->th_sum); } else if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6)) { fp->eth_q_stats.tx_ofld_frames_csum_udp++; /* * Everest1 (i.e. 57710, 57711, 57711E) does not natively support UDP * checksums and does not know anything about the UDP header and where * the checksum field is located. It only knows about TCP. Therefore * we "lie" to the hardware for outgoing UDP packets w/ checksum * offload. Since the checksum field offset for TCP is 16 bytes and * for UDP it is 6 bytes we pass a pointer to the hardware that is 10 * bytes less than the start of the UDP header. This allows the * hardware to write the checksum in the correct spot. But the * hardware will compute a checksum which includes the last 10 bytes * of the IP header. To correct this we tweak the stack computed * pseudo checksum by folding in the calculation of the inverse * checksum for those final 10 bytes of the IP header. This allows * the correct checksum to be computed by the hardware. */ /* set pointer 10 bytes before UDP header */ tmp_uh = (uint32_t *)((uint8_t *)uh - 10); /* calculate a pseudo header checksum over the first 10 bytes */ tmp_csum = in_pseudo(*tmp_uh, *(tmp_uh + 1), *(uint16_t *)(tmp_uh + 2)); pbd->tcp_pseudo_csum = ntohs(in_addword(uh->uh_sum, ~tmp_csum)); } return (hlen * 2); /* entire header length, number of bytes */ } static void bxe_set_pbd_lso_e2(struct mbuf *m, uint32_t *parsing_data) { *parsing_data |= ((m->m_pkthdr.tso_segsz << ETH_TX_PARSE_BD_E2_LSO_MSS_SHIFT) & ETH_TX_PARSE_BD_E2_LSO_MSS); /* XXX test for IPv6 with extension header... */ #if 0 struct ip6_hdr *ip6; if (ip6 && ip6->ip6_nxt == 'some ipv6 extension header') *parsing_data |= ETH_TX_PARSE_BD_E2_IPV6_WITH_EXT_HDR; #endif } static void bxe_set_pbd_lso(struct mbuf *m, struct eth_tx_parse_bd_e1x *pbd) { struct ether_vlan_header *eh = NULL; struct ip *ip = NULL; struct tcphdr *th = NULL; int e_hlen; /* get the Ethernet header */ eh = mtod(m, struct ether_vlan_header *); /* handle VLAN encapsulation if present */ e_hlen = (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) ? (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) : ETHER_HDR_LEN; /* get the IP and TCP header, with LSO entire header in first mbuf */ /* XXX assuming IPv4 */ ip = (struct ip *)(m->m_data + e_hlen); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); pbd->lso_mss = htole16(m->m_pkthdr.tso_segsz); pbd->tcp_send_seq = ntohl(th->th_seq); pbd->tcp_flags = ((ntohl(((uint32_t *)th)[3]) >> 16) & 0xff); #if 1 /* XXX IPv4 */ pbd->ip_id = ntohs(ip->ip_id); pbd->tcp_pseudo_csum = ntohs(in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP))); #else /* XXX IPv6 */ pbd->tcp_pseudo_csum = ntohs(in_pseudo(&ip6->ip6_src, &ip6->ip6_dst, htons(IPPROTO_TCP))); #endif pbd->global_data |= htole16(ETH_TX_PARSE_BD_E1X_PSEUDO_CS_WITHOUT_LEN); } /* * Encapsulte an mbuf cluster into the tx bd chain and makes the memory * visible to the controller. * * If an mbuf is submitted to this routine and cannot be given to the * controller (e.g. it has too many fragments) then the function may free * the mbuf and return to the caller. * * Returns: * 0 = Success, !0 = Failure * Note the side effect that an mbuf may be freed if it causes a problem. */ static int bxe_tx_encap(struct bxe_fastpath *fp, struct mbuf **m_head) { bus_dma_segment_t segs[32]; struct mbuf *m0; struct bxe_sw_tx_bd *tx_buf; struct eth_tx_parse_bd_e1x *pbd_e1x = NULL; struct eth_tx_parse_bd_e2 *pbd_e2 = NULL; /* struct eth_tx_parse_2nd_bd *pbd2 = NULL; */ struct eth_tx_bd *tx_data_bd; struct eth_tx_bd *tx_total_pkt_size_bd; struct eth_tx_start_bd *tx_start_bd; uint16_t bd_prod, pkt_prod, total_pkt_size; uint8_t mac_type; int defragged, error, nsegs, rc, nbds, vlan_off, ovlan; struct bxe_softc *sc; uint16_t tx_bd_avail; struct ether_vlan_header *eh; uint32_t pbd_e2_parsing_data = 0; uint8_t hlen = 0; int tmp_bd; int i; sc = fp->sc; M_ASSERTPKTHDR(*m_head); m0 = *m_head; rc = defragged = nbds = ovlan = vlan_off = total_pkt_size = 0; tx_start_bd = NULL; tx_data_bd = NULL; tx_total_pkt_size_bd = NULL; /* get the H/W pointer for packets and BDs */ pkt_prod = fp->tx_pkt_prod; bd_prod = fp->tx_bd_prod; mac_type = UNICAST_ADDRESS; /* map the mbuf into the next open DMAable memory */ tx_buf = &fp->tx_mbuf_chain[TX_BD(pkt_prod)]; error = bus_dmamap_load_mbuf_sg(fp->tx_mbuf_tag, tx_buf->m_map, m0, segs, &nsegs, BUS_DMA_NOWAIT); /* mapping errors */ if(__predict_false(error != 0)) { fp->eth_q_stats.tx_dma_mapping_failure++; if (error == ENOMEM) { /* resource issue, try again later */ rc = ENOMEM; } else if (error == EFBIG) { /* possibly recoverable with defragmentation */ fp->eth_q_stats.mbuf_defrag_attempts++; m0 = m_defrag(*m_head, M_NOWAIT); if (m0 == NULL) { fp->eth_q_stats.mbuf_defrag_failures++; rc = ENOBUFS; } else { /* defrag successful, try mapping again */ *m_head = m0; error = bus_dmamap_load_mbuf_sg(fp->tx_mbuf_tag, tx_buf->m_map, m0, segs, &nsegs, BUS_DMA_NOWAIT); if (error) { fp->eth_q_stats.tx_dma_mapping_failure++; rc = error; } } } else { /* unknown, unrecoverable mapping error */ BLOGE(sc, "Unknown TX mapping error rc=%d\n", error); bxe_dump_mbuf(sc, m0, FALSE); rc = error; } goto bxe_tx_encap_continue; } tx_bd_avail = bxe_tx_avail(sc, fp); /* make sure there is enough room in the send queue */ if (__predict_false(tx_bd_avail < (nsegs + 2))) { /* Recoverable, try again later. */ fp->eth_q_stats.tx_hw_queue_full++; bus_dmamap_unload(fp->tx_mbuf_tag, tx_buf->m_map); rc = ENOMEM; goto bxe_tx_encap_continue; } /* capture the current H/W TX chain high watermark */ if (__predict_false(fp->eth_q_stats.tx_hw_max_queue_depth < (TX_BD_USABLE - tx_bd_avail))) { fp->eth_q_stats.tx_hw_max_queue_depth = (TX_BD_USABLE - tx_bd_avail); } /* make sure it fits in the packet window */ if (__predict_false(nsegs > BXE_MAX_SEGMENTS)) { /* * The mbuf may be to big for the controller to handle. If the frame * is a TSO frame we'll need to do an additional check. */ if (m0->m_pkthdr.csum_flags & CSUM_TSO) { if (bxe_chktso_window(sc, nsegs, segs, m0) == 0) { goto bxe_tx_encap_continue; /* OK to send */ } else { fp->eth_q_stats.tx_window_violation_tso++; } } else { fp->eth_q_stats.tx_window_violation_std++; } /* lets try to defragment this mbuf and remap it */ fp->eth_q_stats.mbuf_defrag_attempts++; bus_dmamap_unload(fp->tx_mbuf_tag, tx_buf->m_map); m0 = m_defrag(*m_head, M_NOWAIT); if (m0 == NULL) { fp->eth_q_stats.mbuf_defrag_failures++; /* Ugh, just drop the frame... :( */ rc = ENOBUFS; } else { /* defrag successful, try mapping again */ *m_head = m0; error = bus_dmamap_load_mbuf_sg(fp->tx_mbuf_tag, tx_buf->m_map, m0, segs, &nsegs, BUS_DMA_NOWAIT); if (error) { fp->eth_q_stats.tx_dma_mapping_failure++; /* No sense in trying to defrag/copy chain, drop it. :( */ rc = error; } else { /* if the chain is still too long then drop it */ if (__predict_false(nsegs > BXE_MAX_SEGMENTS)) { bus_dmamap_unload(fp->tx_mbuf_tag, tx_buf->m_map); rc = ENODEV; } } } } bxe_tx_encap_continue: /* Check for errors */ if (rc) { if (rc == ENOMEM) { /* recoverable try again later */ } else { fp->eth_q_stats.tx_soft_errors++; fp->eth_q_stats.mbuf_alloc_tx--; m_freem(*m_head); *m_head = NULL; } return (rc); } /* set flag according to packet type (UNICAST_ADDRESS is default) */ if (m0->m_flags & M_BCAST) { mac_type = BROADCAST_ADDRESS; } else if (m0->m_flags & M_MCAST) { mac_type = MULTICAST_ADDRESS; } /* store the mbuf into the mbuf ring */ tx_buf->m = m0; tx_buf->first_bd = fp->tx_bd_prod; tx_buf->flags = 0; /* prepare the first transmit (start) BD for the mbuf */ tx_start_bd = &fp->tx_chain[TX_BD(bd_prod)].start_bd; BLOGD(sc, DBG_TX, "sending pkt_prod=%u tx_buf=%p next_idx=%u bd=%u tx_start_bd=%p\n", pkt_prod, tx_buf, fp->tx_pkt_prod, bd_prod, tx_start_bd); tx_start_bd->addr_lo = htole32(U64_LO(segs[0].ds_addr)); tx_start_bd->addr_hi = htole32(U64_HI(segs[0].ds_addr)); tx_start_bd->nbytes = htole16(segs[0].ds_len); total_pkt_size += tx_start_bd->nbytes; tx_start_bd->bd_flags.as_bitfield = ETH_TX_BD_FLAGS_START_BD; tx_start_bd->general_data = (1 << ETH_TX_START_BD_HDR_NBDS_SHIFT); /* all frames have at least Start BD + Parsing BD */ nbds = nsegs + 1; tx_start_bd->nbd = htole16(nbds); if (m0->m_flags & M_VLANTAG) { tx_start_bd->vlan_or_ethertype = htole16(m0->m_pkthdr.ether_vtag); tx_start_bd->bd_flags.as_bitfield |= (X_ETH_OUTBAND_VLAN << ETH_TX_BD_FLAGS_VLAN_MODE_SHIFT); } else { /* vf tx, start bd must hold the ethertype for fw to enforce it */ if (IS_VF(sc)) { /* map ethernet header to find type and header length */ eh = mtod(m0, struct ether_vlan_header *); tx_start_bd->vlan_or_ethertype = eh->evl_encap_proto; } else { /* used by FW for packet accounting */ tx_start_bd->vlan_or_ethertype = htole16(fp->tx_pkt_prod); #if 0 /* * If NPAR-SD is active then FW should do the tagging regardless * of value of priority. Otherwise, if priority indicates this is * a control packet we need to indicate to FW to avoid tagging. */ if (!IS_MF_AFEX(sc) && (mbuf priority == PRIO_CONTROL)) { SET_FLAG(tx_start_bd->general_data, ETH_TX_START_BD_FORCE_VLAN_MODE, 1); } #endif } } /* * add a parsing BD from the chain. The parsing BD is always added * though it is only used for TSO and chksum */ bd_prod = TX_BD_NEXT(bd_prod); if (m0->m_pkthdr.csum_flags) { if (m0->m_pkthdr.csum_flags & CSUM_IP) { fp->eth_q_stats.tx_ofld_frames_csum_ip++; tx_start_bd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_IP_CSUM; } if (m0->m_pkthdr.csum_flags & CSUM_TCP_IPV6) { tx_start_bd->bd_flags.as_bitfield |= (ETH_TX_BD_FLAGS_IPV6 | ETH_TX_BD_FLAGS_L4_CSUM); } else if (m0->m_pkthdr.csum_flags & CSUM_UDP_IPV6) { tx_start_bd->bd_flags.as_bitfield |= (ETH_TX_BD_FLAGS_IPV6 | ETH_TX_BD_FLAGS_IS_UDP | ETH_TX_BD_FLAGS_L4_CSUM); } else if ((m0->m_pkthdr.csum_flags & CSUM_TCP) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) { tx_start_bd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_L4_CSUM; } else if (m0->m_pkthdr.csum_flags & CSUM_UDP) { tx_start_bd->bd_flags.as_bitfield |= (ETH_TX_BD_FLAGS_L4_CSUM | ETH_TX_BD_FLAGS_IS_UDP); } } if (!CHIP_IS_E1x(sc)) { pbd_e2 = &fp->tx_chain[TX_BD(bd_prod)].parse_bd_e2; memset(pbd_e2, 0, sizeof(struct eth_tx_parse_bd_e2)); if (m0->m_pkthdr.csum_flags) { hlen = bxe_set_pbd_csum_e2(fp, m0, &pbd_e2_parsing_data); } #if 0 /* * Add the MACs to the parsing BD if the module param was * explicitly set, if this is a vf, or in switch independent * mode. */ if (sc->flags & BXE_TX_SWITCHING || IS_VF(sc) || IS_MF_SI(sc)) { eh = mtod(m0, struct ether_vlan_header *); bxe_set_fw_mac_addr(&pbd_e2->data.mac_addr.src_hi, &pbd_e2->data.mac_addr.src_mid, &pbd_e2->data.mac_addr.src_lo, eh->evl_shost); bxe_set_fw_mac_addr(&pbd_e2->data.mac_addr.dst_hi, &pbd_e2->data.mac_addr.dst_mid, &pbd_e2->data.mac_addr.dst_lo, eh->evl_dhost); } #endif SET_FLAG(pbd_e2_parsing_data, ETH_TX_PARSE_BD_E2_ETH_ADDR_TYPE, mac_type); } else { uint16_t global_data = 0; pbd_e1x = &fp->tx_chain[TX_BD(bd_prod)].parse_bd_e1x; memset(pbd_e1x, 0, sizeof(struct eth_tx_parse_bd_e1x)); if (m0->m_pkthdr.csum_flags) { hlen = bxe_set_pbd_csum(fp, m0, pbd_e1x); } SET_FLAG(global_data, ETH_TX_PARSE_BD_E1X_ETH_ADDR_TYPE, mac_type); pbd_e1x->global_data |= htole16(global_data); } /* setup the parsing BD with TSO specific info */ if (m0->m_pkthdr.csum_flags & CSUM_TSO) { fp->eth_q_stats.tx_ofld_frames_lso++; tx_start_bd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_SW_LSO; if (__predict_false(tx_start_bd->nbytes > hlen)) { fp->eth_q_stats.tx_ofld_frames_lso_hdr_splits++; /* split the first BD into header/data making the fw job easy */ nbds++; tx_start_bd->nbd = htole16(nbds); tx_start_bd->nbytes = htole16(hlen); bd_prod = TX_BD_NEXT(bd_prod); /* new transmit BD after the tx_parse_bd */ tx_data_bd = &fp->tx_chain[TX_BD(bd_prod)].reg_bd; tx_data_bd->addr_hi = htole32(U64_HI(segs[0].ds_addr + hlen)); tx_data_bd->addr_lo = htole32(U64_LO(segs[0].ds_addr + hlen)); tx_data_bd->nbytes = htole16(segs[0].ds_len - hlen); if (tx_total_pkt_size_bd == NULL) { tx_total_pkt_size_bd = tx_data_bd; } BLOGD(sc, DBG_TX, "TSO split header size is %d (%x:%x) nbds %d\n", le16toh(tx_start_bd->nbytes), le32toh(tx_start_bd->addr_hi), le32toh(tx_start_bd->addr_lo), nbds); } if (!CHIP_IS_E1x(sc)) { bxe_set_pbd_lso_e2(m0, &pbd_e2_parsing_data); } else { bxe_set_pbd_lso(m0, pbd_e1x); } } if (pbd_e2_parsing_data) { pbd_e2->parsing_data = htole32(pbd_e2_parsing_data); } /* prepare remaining BDs, start tx bd contains first seg/frag */ for (i = 1; i < nsegs ; i++) { bd_prod = TX_BD_NEXT(bd_prod); tx_data_bd = &fp->tx_chain[TX_BD(bd_prod)].reg_bd; tx_data_bd->addr_lo = htole32(U64_LO(segs[i].ds_addr)); tx_data_bd->addr_hi = htole32(U64_HI(segs[i].ds_addr)); tx_data_bd->nbytes = htole16(segs[i].ds_len); if (tx_total_pkt_size_bd == NULL) { tx_total_pkt_size_bd = tx_data_bd; } total_pkt_size += tx_data_bd->nbytes; } BLOGD(sc, DBG_TX, "last bd %p\n", tx_data_bd); if (tx_total_pkt_size_bd != NULL) { tx_total_pkt_size_bd->total_pkt_bytes = total_pkt_size; } if (__predict_false(sc->debug & DBG_TX)) { tmp_bd = tx_buf->first_bd; for (i = 0; i < nbds; i++) { if (i == 0) { BLOGD(sc, DBG_TX, "TX Strt: %p bd=%d nbd=%d vlan=0x%x " "bd_flags=0x%x hdr_nbds=%d\n", tx_start_bd, tmp_bd, le16toh(tx_start_bd->nbd), le16toh(tx_start_bd->vlan_or_ethertype), tx_start_bd->bd_flags.as_bitfield, (tx_start_bd->general_data & ETH_TX_START_BD_HDR_NBDS)); } else if (i == 1) { if (pbd_e1x) { BLOGD(sc, DBG_TX, "-> Prse: %p bd=%d global=0x%x ip_hlen_w=%u " "ip_id=%u lso_mss=%u tcp_flags=0x%x csum=0x%x " "tcp_seq=%u total_hlen_w=%u\n", pbd_e1x, tmp_bd, pbd_e1x->global_data, pbd_e1x->ip_hlen_w, pbd_e1x->ip_id, pbd_e1x->lso_mss, pbd_e1x->tcp_flags, pbd_e1x->tcp_pseudo_csum, pbd_e1x->tcp_send_seq, le16toh(pbd_e1x->total_hlen_w)); } else { /* if (pbd_e2) */ BLOGD(sc, DBG_TX, "-> Parse: %p bd=%d dst=%02x:%02x:%02x " "src=%02x:%02x:%02x parsing_data=0x%x\n", pbd_e2, tmp_bd, pbd_e2->data.mac_addr.dst_hi, pbd_e2->data.mac_addr.dst_mid, pbd_e2->data.mac_addr.dst_lo, pbd_e2->data.mac_addr.src_hi, pbd_e2->data.mac_addr.src_mid, pbd_e2->data.mac_addr.src_lo, pbd_e2->parsing_data); } } if (i != 1) { /* skip parse db as it doesn't hold data */ tx_data_bd = &fp->tx_chain[TX_BD(tmp_bd)].reg_bd; BLOGD(sc, DBG_TX, "-> Frag: %p bd=%d nbytes=%d hi=0x%x lo: 0x%x\n", tx_data_bd, tmp_bd, le16toh(tx_data_bd->nbytes), le32toh(tx_data_bd->addr_hi), le32toh(tx_data_bd->addr_lo)); } tmp_bd = TX_BD_NEXT(tmp_bd); } } BLOGD(sc, DBG_TX, "doorbell: nbds=%d bd=%u\n", nbds, bd_prod); /* update TX BD producer index value for next TX */ bd_prod = TX_BD_NEXT(bd_prod); /* * If the chain of tx_bd's describing this frame is adjacent to or spans * an eth_tx_next_bd element then we need to increment the nbds value. */ if (TX_BD_IDX(bd_prod) < nbds) { nbds++; } /* don't allow reordering of writes for nbd and packets */ mb(); fp->tx_db.data.prod += nbds; /* producer points to the next free tx_bd at this point */ fp->tx_pkt_prod++; fp->tx_bd_prod = bd_prod; DOORBELL(sc, fp->index, fp->tx_db.raw); fp->eth_q_stats.tx_pkts++; /* Prevent speculative reads from getting ahead of the status block. */ bus_space_barrier(sc->bar[BAR0].tag, sc->bar[BAR0].handle, 0, 0, BUS_SPACE_BARRIER_READ); /* Prevent speculative reads from getting ahead of the doorbell. */ bus_space_barrier(sc->bar[BAR2].tag, sc->bar[BAR2].handle, 0, 0, BUS_SPACE_BARRIER_READ); return (0); } static void bxe_tx_start_locked(struct bxe_softc *sc, if_t ifp, struct bxe_fastpath *fp) { struct mbuf *m = NULL; int tx_count = 0; uint16_t tx_bd_avail; BXE_FP_TX_LOCK_ASSERT(fp); /* keep adding entries while there are frames to send */ while (!if_sendq_empty(ifp)) { /* * check for any frames to send * dequeue can still be NULL even if queue is not empty */ m = if_dequeue(ifp); if (__predict_false(m == NULL)) { break; } /* the mbuf now belongs to us */ fp->eth_q_stats.mbuf_alloc_tx++; /* * Put the frame into the transmit ring. If we don't have room, * place the mbuf back at the head of the TX queue, set the * OACTIVE flag, and wait for the NIC to drain the chain. */ if (__predict_false(bxe_tx_encap(fp, &m))) { fp->eth_q_stats.tx_encap_failures++; if (m != NULL) { /* mark the TX queue as full and return the frame */ if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); if_sendq_prepend(ifp, m); fp->eth_q_stats.mbuf_alloc_tx--; fp->eth_q_stats.tx_queue_xoff++; } /* stop looking for more work */ break; } /* the frame was enqueued successfully */ tx_count++; /* send a copy of the frame to any BPF listeners. */ if_etherbpfmtap(ifp, m); tx_bd_avail = bxe_tx_avail(sc, fp); /* handle any completions if we're running low */ if (tx_bd_avail < BXE_TX_CLEANUP_THRESHOLD) { /* bxe_txeof will set IFF_DRV_OACTIVE appropriately */ bxe_txeof(sc, fp); if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE) { break; } } } /* all TX packets were dequeued and/or the tx ring is full */ if (tx_count > 0) { /* reset the TX watchdog timeout timer */ fp->watchdog_timer = BXE_TX_TIMEOUT; } } /* Legacy (non-RSS) dispatch routine */ static void bxe_tx_start(if_t ifp) { struct bxe_softc *sc; struct bxe_fastpath *fp; sc = if_getsoftc(ifp); if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) { BLOGW(sc, "Interface not running, ignoring transmit request\n"); return; } if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE) { BLOGW(sc, "Interface TX queue is full, ignoring transmit request\n"); return; } if (!sc->link_vars.link_up) { BLOGW(sc, "Interface link is down, ignoring transmit request\n"); return; } fp = &sc->fp[0]; BXE_FP_TX_LOCK(fp); bxe_tx_start_locked(sc, ifp, fp); BXE_FP_TX_UNLOCK(fp); } #if __FreeBSD_version >= 800000 static int bxe_tx_mq_start_locked(struct bxe_softc *sc, if_t ifp, struct bxe_fastpath *fp, struct mbuf *m) { struct buf_ring *tx_br = fp->tx_br; struct mbuf *next; int depth, rc, tx_count; uint16_t tx_bd_avail; rc = tx_count = 0; if (!tx_br) { BLOGE(sc, "Multiqueue TX and no buf_ring!\n"); return (EINVAL); } /* fetch the depth of the driver queue */ depth = drbr_inuse_drv(ifp, tx_br); if (depth > fp->eth_q_stats.tx_max_drbr_queue_depth) { fp->eth_q_stats.tx_max_drbr_queue_depth = depth; } BXE_FP_TX_LOCK_ASSERT(fp); if (m == NULL) { /* no new work, check for pending frames */ next = drbr_dequeue_drv(ifp, tx_br); } else if (drbr_needs_enqueue_drv(ifp, tx_br)) { /* have both new and pending work, maintain packet order */ rc = drbr_enqueue_drv(ifp, tx_br, m); if (rc != 0) { fp->eth_q_stats.tx_soft_errors++; goto bxe_tx_mq_start_locked_exit; } next = drbr_dequeue_drv(ifp, tx_br); } else { /* new work only and nothing pending */ next = m; } /* keep adding entries while there are frames to send */ while (next != NULL) { /* the mbuf now belongs to us */ fp->eth_q_stats.mbuf_alloc_tx++; /* * Put the frame into the transmit ring. If we don't have room, * place the mbuf back at the head of the TX queue, set the * OACTIVE flag, and wait for the NIC to drain the chain. */ rc = bxe_tx_encap(fp, &next); if (__predict_false(rc != 0)) { fp->eth_q_stats.tx_encap_failures++; if (next != NULL) { /* mark the TX queue as full and save the frame */ if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); /* XXX this may reorder the frame */ rc = drbr_enqueue_drv(ifp, tx_br, next); fp->eth_q_stats.mbuf_alloc_tx--; fp->eth_q_stats.tx_frames_deferred++; } /* stop looking for more work */ break; } /* the transmit frame was enqueued successfully */ tx_count++; /* send a copy of the frame to any BPF listeners */ if_etherbpfmtap(ifp, next); tx_bd_avail = bxe_tx_avail(sc, fp); /* handle any completions if we're running low */ if (tx_bd_avail < BXE_TX_CLEANUP_THRESHOLD) { /* bxe_txeof will set IFF_DRV_OACTIVE appropriately */ bxe_txeof(sc, fp); if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE) { break; } } next = drbr_dequeue_drv(ifp, tx_br); } /* all TX packets were dequeued and/or the tx ring is full */ if (tx_count > 0) { /* reset the TX watchdog timeout timer */ fp->watchdog_timer = BXE_TX_TIMEOUT; } bxe_tx_mq_start_locked_exit: return (rc); } /* Multiqueue (TSS) dispatch routine. */ static int bxe_tx_mq_start(struct ifnet *ifp, struct mbuf *m) { struct bxe_softc *sc = if_getsoftc(ifp); struct bxe_fastpath *fp; int fp_index, rc; fp_index = 0; /* default is the first queue */ - /* change the queue if using flow ID */ - if ((m->m_flags & M_FLOWID) != 0) { + /* check if flowid is set */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) fp_index = (m->m_pkthdr.flowid % sc->num_queues); - } fp = &sc->fp[fp_index]; if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) { BLOGW(sc, "Interface not running, ignoring transmit request\n"); return (ENETDOWN); } if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE) { BLOGW(sc, "Interface TX queue is full, ignoring transmit request\n"); return (EBUSY); } if (!sc->link_vars.link_up) { BLOGW(sc, "Interface link is down, ignoring transmit request\n"); return (ENETDOWN); } /* XXX change to TRYLOCK here and if failed then schedule taskqueue */ BXE_FP_TX_LOCK(fp); rc = bxe_tx_mq_start_locked(sc, ifp, fp, m); BXE_FP_TX_UNLOCK(fp); return (rc); } static void bxe_mq_flush(struct ifnet *ifp) { struct bxe_softc *sc = if_getsoftc(ifp); struct bxe_fastpath *fp; struct mbuf *m; int i; for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; if (fp->state != BXE_FP_STATE_OPEN) { BLOGD(sc, DBG_LOAD, "Not clearing fp[%02d] buf_ring (state=%d)\n", fp->index, fp->state); continue; } if (fp->tx_br != NULL) { BLOGD(sc, DBG_LOAD, "Clearing fp[%02d] buf_ring\n", fp->index); BXE_FP_TX_LOCK(fp); while ((m = buf_ring_dequeue_sc(fp->tx_br)) != NULL) { m_freem(m); } BXE_FP_TX_UNLOCK(fp); } } if_qflush(ifp); } #endif /* FreeBSD_version >= 800000 */ static uint16_t bxe_cid_ilt_lines(struct bxe_softc *sc) { if (IS_SRIOV(sc)) { return ((BXE_FIRST_VF_CID + BXE_VF_CIDS) / ILT_PAGE_CIDS); } return (L2_ILT_LINES(sc)); } static void bxe_ilt_set_info(struct bxe_softc *sc) { struct ilt_client_info *ilt_client; struct ecore_ilt *ilt = sc->ilt; uint16_t line = 0; ilt->start_line = FUNC_ILT_BASE(SC_FUNC(sc)); BLOGD(sc, DBG_LOAD, "ilt starts at line %d\n", ilt->start_line); /* CDU */ ilt_client = &ilt->clients[ILT_CLIENT_CDU]; ilt_client->client_num = ILT_CLIENT_CDU; ilt_client->page_size = CDU_ILT_PAGE_SZ; ilt_client->flags = ILT_CLIENT_SKIP_MEM; ilt_client->start = line; line += bxe_cid_ilt_lines(sc); if (CNIC_SUPPORT(sc)) { line += CNIC_ILT_LINES; } ilt_client->end = (line - 1); BLOGD(sc, DBG_LOAD, "ilt client[CDU]: start %d, end %d, " "psz 0x%x, flags 0x%x, hw psz %d\n", ilt_client->start, ilt_client->end, ilt_client->page_size, ilt_client->flags, ilog2(ilt_client->page_size >> 12)); /* QM */ if (QM_INIT(sc->qm_cid_count)) { ilt_client = &ilt->clients[ILT_CLIENT_QM]; ilt_client->client_num = ILT_CLIENT_QM; ilt_client->page_size = QM_ILT_PAGE_SZ; ilt_client->flags = 0; ilt_client->start = line; /* 4 bytes for each cid */ line += DIV_ROUND_UP(sc->qm_cid_count * QM_QUEUES_PER_FUNC * 4, QM_ILT_PAGE_SZ); ilt_client->end = (line - 1); BLOGD(sc, DBG_LOAD, "ilt client[QM]: start %d, end %d, " "psz 0x%x, flags 0x%x, hw psz %d\n", ilt_client->start, ilt_client->end, ilt_client->page_size, ilt_client->flags, ilog2(ilt_client->page_size >> 12)); } if (CNIC_SUPPORT(sc)) { /* SRC */ ilt_client = &ilt->clients[ILT_CLIENT_SRC]; ilt_client->client_num = ILT_CLIENT_SRC; ilt_client->page_size = SRC_ILT_PAGE_SZ; ilt_client->flags = 0; ilt_client->start = line; line += SRC_ILT_LINES; ilt_client->end = (line - 1); BLOGD(sc, DBG_LOAD, "ilt client[SRC]: start %d, end %d, " "psz 0x%x, flags 0x%x, hw psz %d\n", ilt_client->start, ilt_client->end, ilt_client->page_size, ilt_client->flags, ilog2(ilt_client->page_size >> 12)); /* TM */ ilt_client = &ilt->clients[ILT_CLIENT_TM]; ilt_client->client_num = ILT_CLIENT_TM; ilt_client->page_size = TM_ILT_PAGE_SZ; ilt_client->flags = 0; ilt_client->start = line; line += TM_ILT_LINES; ilt_client->end = (line - 1); BLOGD(sc, DBG_LOAD, "ilt client[TM]: start %d, end %d, " "psz 0x%x, flags 0x%x, hw psz %d\n", ilt_client->start, ilt_client->end, ilt_client->page_size, ilt_client->flags, ilog2(ilt_client->page_size >> 12)); } KASSERT((line <= ILT_MAX_LINES), ("Invalid number of ILT lines!")); } static void bxe_set_fp_rx_buf_size(struct bxe_softc *sc) { int i; BLOGD(sc, DBG_LOAD, "mtu = %d\n", sc->mtu); for (i = 0; i < sc->num_queues; i++) { /* get the Rx buffer size for RX frames */ sc->fp[i].rx_buf_size = (IP_HEADER_ALIGNMENT_PADDING + ETH_OVERHEAD + sc->mtu); BLOGD(sc, DBG_LOAD, "rx_buf_size for fp[%02d] = %d\n", i, sc->fp[i].rx_buf_size); /* get the mbuf allocation size for RX frames */ if (sc->fp[i].rx_buf_size <= MCLBYTES) { sc->fp[i].mbuf_alloc_size = MCLBYTES; } else if (sc->fp[i].rx_buf_size <= BCM_PAGE_SIZE) { sc->fp[i].mbuf_alloc_size = PAGE_SIZE; } else { sc->fp[i].mbuf_alloc_size = MJUM9BYTES; } BLOGD(sc, DBG_LOAD, "mbuf_alloc_size for fp[%02d] = %d\n", i, sc->fp[i].mbuf_alloc_size); } } static int bxe_alloc_ilt_mem(struct bxe_softc *sc) { int rc = 0; if ((sc->ilt = (struct ecore_ilt *)malloc(sizeof(struct ecore_ilt), M_BXE_ILT, (M_NOWAIT | M_ZERO))) == NULL) { rc = 1; } return (rc); } static int bxe_alloc_ilt_lines_mem(struct bxe_softc *sc) { int rc = 0; if ((sc->ilt->lines = (struct ilt_line *)malloc((sizeof(struct ilt_line) * ILT_MAX_LINES), M_BXE_ILT, (M_NOWAIT | M_ZERO))) == NULL) { rc = 1; } return (rc); } static void bxe_free_ilt_mem(struct bxe_softc *sc) { if (sc->ilt != NULL) { free(sc->ilt, M_BXE_ILT); sc->ilt = NULL; } } static void bxe_free_ilt_lines_mem(struct bxe_softc *sc) { if (sc->ilt->lines != NULL) { free(sc->ilt->lines, M_BXE_ILT); sc->ilt->lines = NULL; } } static void bxe_free_mem(struct bxe_softc *sc) { int i; #if 0 if (!CONFIGURE_NIC_MODE(sc)) { /* free searcher T2 table */ bxe_dma_free(sc, &sc->t2); } #endif for (i = 0; i < L2_ILT_LINES(sc); i++) { bxe_dma_free(sc, &sc->context[i].vcxt_dma); sc->context[i].vcxt = NULL; sc->context[i].size = 0; } ecore_ilt_mem_op(sc, ILT_MEMOP_FREE); bxe_free_ilt_lines_mem(sc); #if 0 bxe_iov_free_mem(sc); #endif } static int bxe_alloc_mem(struct bxe_softc *sc) { int context_size; int allocated; int i; #if 0 if (!CONFIGURE_NIC_MODE(sc)) { /* allocate searcher T2 table */ if (bxe_dma_alloc(sc, SRC_T2_SZ, &sc->t2, "searcher t2 table") != 0) { return (-1); } } #endif /* * Allocate memory for CDU context: * This memory is allocated separately and not in the generic ILT * functions because CDU differs in few aspects: * 1. There can be multiple entities allocating memory for context - * regular L2, CNIC, and SRIOV drivers. Each separately controls * its own ILT lines. * 2. Since CDU page-size is not a single 4KB page (which is the case * for the other ILT clients), to be efficient we want to support * allocation of sub-page-size in the last entry. * 3. Context pointers are used by the driver to pass to FW / update * the context (for the other ILT clients the pointers are used just to * free the memory during unload). */ context_size = (sizeof(union cdu_context) * BXE_L2_CID_COUNT(sc)); for (i = 0, allocated = 0; allocated < context_size; i++) { sc->context[i].size = min(CDU_ILT_PAGE_SZ, (context_size - allocated)); if (bxe_dma_alloc(sc, sc->context[i].size, &sc->context[i].vcxt_dma, "cdu context") != 0) { bxe_free_mem(sc); return (-1); } sc->context[i].vcxt = (union cdu_context *)sc->context[i].vcxt_dma.vaddr; allocated += sc->context[i].size; } bxe_alloc_ilt_lines_mem(sc); BLOGD(sc, DBG_LOAD, "ilt=%p start_line=%u lines=%p\n", sc->ilt, sc->ilt->start_line, sc->ilt->lines); { for (i = 0; i < 4; i++) { BLOGD(sc, DBG_LOAD, "c%d page_size=%u start=%u end=%u num=%u flags=0x%x\n", i, sc->ilt->clients[i].page_size, sc->ilt->clients[i].start, sc->ilt->clients[i].end, sc->ilt->clients[i].client_num, sc->ilt->clients[i].flags); } } if (ecore_ilt_mem_op(sc, ILT_MEMOP_ALLOC)) { BLOGE(sc, "ecore_ilt_mem_op ILT_MEMOP_ALLOC failed\n"); bxe_free_mem(sc); return (-1); } #if 0 if (bxe_iov_alloc_mem(sc)) { BLOGE(sc, "Failed to allocate memory for SRIOV\n"); bxe_free_mem(sc); return (-1); } #endif return (0); } static void bxe_free_rx_bd_chain(struct bxe_fastpath *fp) { struct bxe_softc *sc; int i; sc = fp->sc; if (fp->rx_mbuf_tag == NULL) { return; } /* free all mbufs and unload all maps */ for (i = 0; i < RX_BD_TOTAL; i++) { if (fp->rx_mbuf_chain[i].m_map != NULL) { bus_dmamap_sync(fp->rx_mbuf_tag, fp->rx_mbuf_chain[i].m_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fp->rx_mbuf_tag, fp->rx_mbuf_chain[i].m_map); } if (fp->rx_mbuf_chain[i].m != NULL) { m_freem(fp->rx_mbuf_chain[i].m); fp->rx_mbuf_chain[i].m = NULL; fp->eth_q_stats.mbuf_alloc_rx--; } } } static void bxe_free_tpa_pool(struct bxe_fastpath *fp) { struct bxe_softc *sc; int i, max_agg_queues; sc = fp->sc; if (fp->rx_mbuf_tag == NULL) { return; } max_agg_queues = MAX_AGG_QS(sc); /* release all mbufs and unload all DMA maps in the TPA pool */ for (i = 0; i < max_agg_queues; i++) { if (fp->rx_tpa_info[i].bd.m_map != NULL) { bus_dmamap_sync(fp->rx_mbuf_tag, fp->rx_tpa_info[i].bd.m_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fp->rx_mbuf_tag, fp->rx_tpa_info[i].bd.m_map); } if (fp->rx_tpa_info[i].bd.m != NULL) { m_freem(fp->rx_tpa_info[i].bd.m); fp->rx_tpa_info[i].bd.m = NULL; fp->eth_q_stats.mbuf_alloc_tpa--; } } } static void bxe_free_sge_chain(struct bxe_fastpath *fp) { struct bxe_softc *sc; int i; sc = fp->sc; if (fp->rx_sge_mbuf_tag == NULL) { return; } /* rree all mbufs and unload all maps */ for (i = 0; i < RX_SGE_TOTAL; i++) { if (fp->rx_sge_mbuf_chain[i].m_map != NULL) { bus_dmamap_sync(fp->rx_sge_mbuf_tag, fp->rx_sge_mbuf_chain[i].m_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fp->rx_sge_mbuf_tag, fp->rx_sge_mbuf_chain[i].m_map); } if (fp->rx_sge_mbuf_chain[i].m != NULL) { m_freem(fp->rx_sge_mbuf_chain[i].m); fp->rx_sge_mbuf_chain[i].m = NULL; fp->eth_q_stats.mbuf_alloc_sge--; } } } static void bxe_free_fp_buffers(struct bxe_softc *sc) { struct bxe_fastpath *fp; int i; for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; #if __FreeBSD_version >= 800000 if (fp->tx_br != NULL) { struct mbuf *m; /* just in case bxe_mq_flush() wasn't called */ while ((m = buf_ring_dequeue_sc(fp->tx_br)) != NULL) { m_freem(m); } buf_ring_free(fp->tx_br, M_DEVBUF); fp->tx_br = NULL; } #endif /* free all RX buffers */ bxe_free_rx_bd_chain(fp); bxe_free_tpa_pool(fp); bxe_free_sge_chain(fp); if (fp->eth_q_stats.mbuf_alloc_rx != 0) { BLOGE(sc, "failed to claim all rx mbufs (%d left)\n", fp->eth_q_stats.mbuf_alloc_rx); } if (fp->eth_q_stats.mbuf_alloc_sge != 0) { BLOGE(sc, "failed to claim all sge mbufs (%d left)\n", fp->eth_q_stats.mbuf_alloc_sge); } if (fp->eth_q_stats.mbuf_alloc_tpa != 0) { BLOGE(sc, "failed to claim all sge mbufs (%d left)\n", fp->eth_q_stats.mbuf_alloc_tpa); } if (fp->eth_q_stats.mbuf_alloc_tx != 0) { BLOGE(sc, "failed to release tx mbufs (%d left)\n", fp->eth_q_stats.mbuf_alloc_tx); } /* XXX verify all mbufs were reclaimed */ if (mtx_initialized(&fp->tx_mtx)) { mtx_destroy(&fp->tx_mtx); } if (mtx_initialized(&fp->rx_mtx)) { mtx_destroy(&fp->rx_mtx); } } } static int bxe_alloc_rx_bd_mbuf(struct bxe_fastpath *fp, uint16_t prev_index, uint16_t index) { struct bxe_sw_rx_bd *rx_buf; struct eth_rx_bd *rx_bd; bus_dma_segment_t segs[1]; bus_dmamap_t map; struct mbuf *m; int nsegs, rc; rc = 0; /* allocate the new RX BD mbuf */ m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, fp->mbuf_alloc_size); if (__predict_false(m == NULL)) { fp->eth_q_stats.mbuf_rx_bd_alloc_failed++; return (ENOBUFS); } fp->eth_q_stats.mbuf_alloc_rx++; /* initialize the mbuf buffer length */ m->m_pkthdr.len = m->m_len = fp->rx_buf_size; /* map the mbuf into non-paged pool */ rc = bus_dmamap_load_mbuf_sg(fp->rx_mbuf_tag, fp->rx_mbuf_spare_map, m, segs, &nsegs, BUS_DMA_NOWAIT); if (__predict_false(rc != 0)) { fp->eth_q_stats.mbuf_rx_bd_mapping_failed++; m_freem(m); fp->eth_q_stats.mbuf_alloc_rx--; return (rc); } /* all mbufs must map to a single segment */ KASSERT((nsegs == 1), ("Too many segments, %d returned!", nsegs)); /* release any existing RX BD mbuf mappings */ if (prev_index != index) { rx_buf = &fp->rx_mbuf_chain[prev_index]; if (rx_buf->m_map != NULL) { bus_dmamap_sync(fp->rx_mbuf_tag, rx_buf->m_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fp->rx_mbuf_tag, rx_buf->m_map); } /* * We only get here from bxe_rxeof() when the maximum number * of rx buffers is less than RX_BD_USABLE. bxe_rxeof() already * holds the mbuf in the prev_index so it's OK to NULL it out * here without concern of a memory leak. */ fp->rx_mbuf_chain[prev_index].m = NULL; } rx_buf = &fp->rx_mbuf_chain[index]; if (rx_buf->m_map != NULL) { bus_dmamap_sync(fp->rx_mbuf_tag, rx_buf->m_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fp->rx_mbuf_tag, rx_buf->m_map); } /* save the mbuf and mapping info for a future packet */ map = (prev_index != index) ? fp->rx_mbuf_chain[prev_index].m_map : rx_buf->m_map; rx_buf->m_map = fp->rx_mbuf_spare_map; fp->rx_mbuf_spare_map = map; bus_dmamap_sync(fp->rx_mbuf_tag, rx_buf->m_map, BUS_DMASYNC_PREREAD); rx_buf->m = m; rx_bd = &fp->rx_chain[index]; rx_bd->addr_hi = htole32(U64_HI(segs[0].ds_addr)); rx_bd->addr_lo = htole32(U64_LO(segs[0].ds_addr)); return (rc); } static int bxe_alloc_rx_tpa_mbuf(struct bxe_fastpath *fp, int queue) { struct bxe_sw_tpa_info *tpa_info = &fp->rx_tpa_info[queue]; bus_dma_segment_t segs[1]; bus_dmamap_t map; struct mbuf *m; int nsegs; int rc = 0; /* allocate the new TPA mbuf */ m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, fp->mbuf_alloc_size); if (__predict_false(m == NULL)) { fp->eth_q_stats.mbuf_rx_tpa_alloc_failed++; return (ENOBUFS); } fp->eth_q_stats.mbuf_alloc_tpa++; /* initialize the mbuf buffer length */ m->m_pkthdr.len = m->m_len = fp->rx_buf_size; /* map the mbuf into non-paged pool */ rc = bus_dmamap_load_mbuf_sg(fp->rx_mbuf_tag, fp->rx_tpa_info_mbuf_spare_map, m, segs, &nsegs, BUS_DMA_NOWAIT); if (__predict_false(rc != 0)) { fp->eth_q_stats.mbuf_rx_tpa_mapping_failed++; m_free(m); fp->eth_q_stats.mbuf_alloc_tpa--; return (rc); } /* all mbufs must map to a single segment */ KASSERT((nsegs == 1), ("Too many segments, %d returned!", nsegs)); /* release any existing TPA mbuf mapping */ if (tpa_info->bd.m_map != NULL) { bus_dmamap_sync(fp->rx_mbuf_tag, tpa_info->bd.m_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fp->rx_mbuf_tag, tpa_info->bd.m_map); } /* save the mbuf and mapping info for the TPA mbuf */ map = tpa_info->bd.m_map; tpa_info->bd.m_map = fp->rx_tpa_info_mbuf_spare_map; fp->rx_tpa_info_mbuf_spare_map = map; bus_dmamap_sync(fp->rx_mbuf_tag, tpa_info->bd.m_map, BUS_DMASYNC_PREREAD); tpa_info->bd.m = m; tpa_info->seg = segs[0]; return (rc); } /* * Allocate an mbuf and assign it to the receive scatter gather chain. The * caller must take care to save a copy of the existing mbuf in the SG mbuf * chain. */ static int bxe_alloc_rx_sge_mbuf(struct bxe_fastpath *fp, uint16_t index) { struct bxe_sw_rx_bd *sge_buf; struct eth_rx_sge *sge; bus_dma_segment_t segs[1]; bus_dmamap_t map; struct mbuf *m; int nsegs; int rc = 0; /* allocate a new SGE mbuf */ m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, SGE_PAGE_SIZE); if (__predict_false(m == NULL)) { fp->eth_q_stats.mbuf_rx_sge_alloc_failed++; return (ENOMEM); } fp->eth_q_stats.mbuf_alloc_sge++; /* initialize the mbuf buffer length */ m->m_pkthdr.len = m->m_len = SGE_PAGE_SIZE; /* map the SGE mbuf into non-paged pool */ rc = bus_dmamap_load_mbuf_sg(fp->rx_sge_mbuf_tag, fp->rx_sge_mbuf_spare_map, m, segs, &nsegs, BUS_DMA_NOWAIT); if (__predict_false(rc != 0)) { fp->eth_q_stats.mbuf_rx_sge_mapping_failed++; m_freem(m); fp->eth_q_stats.mbuf_alloc_sge--; return (rc); } /* all mbufs must map to a single segment */ KASSERT((nsegs == 1), ("Too many segments, %d returned!", nsegs)); sge_buf = &fp->rx_sge_mbuf_chain[index]; /* release any existing SGE mbuf mapping */ if (sge_buf->m_map != NULL) { bus_dmamap_sync(fp->rx_sge_mbuf_tag, sge_buf->m_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fp->rx_sge_mbuf_tag, sge_buf->m_map); } /* save the mbuf and mapping info for a future packet */ map = sge_buf->m_map; sge_buf->m_map = fp->rx_sge_mbuf_spare_map; fp->rx_sge_mbuf_spare_map = map; bus_dmamap_sync(fp->rx_sge_mbuf_tag, sge_buf->m_map, BUS_DMASYNC_PREREAD); sge_buf->m = m; sge = &fp->rx_sge_chain[index]; sge->addr_hi = htole32(U64_HI(segs[0].ds_addr)); sge->addr_lo = htole32(U64_LO(segs[0].ds_addr)); return (rc); } static __noinline int bxe_alloc_fp_buffers(struct bxe_softc *sc) { struct bxe_fastpath *fp; int i, j, rc = 0; int ring_prod, cqe_ring_prod; int max_agg_queues; for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; #if __FreeBSD_version >= 800000 fp->tx_br = buf_ring_alloc(BXE_BR_SIZE, M_DEVBUF, M_NOWAIT, &fp->tx_mtx); if (fp->tx_br == NULL) { BLOGE(sc, "buf_ring alloc fail for fp[%02d]\n", i); goto bxe_alloc_fp_buffers_error; } #endif ring_prod = cqe_ring_prod = 0; fp->rx_bd_cons = 0; fp->rx_cq_cons = 0; /* allocate buffers for the RX BDs in RX BD chain */ for (j = 0; j < sc->max_rx_bufs; j++) { rc = bxe_alloc_rx_bd_mbuf(fp, ring_prod, ring_prod); if (rc != 0) { BLOGE(sc, "mbuf alloc fail for fp[%02d] rx chain (%d)\n", i, rc); goto bxe_alloc_fp_buffers_error; } ring_prod = RX_BD_NEXT(ring_prod); cqe_ring_prod = RCQ_NEXT(cqe_ring_prod); } fp->rx_bd_prod = ring_prod; fp->rx_cq_prod = cqe_ring_prod; fp->eth_q_stats.rx_calls = fp->eth_q_stats.rx_pkts = 0; if (if_getcapenable(sc->ifp) & IFCAP_LRO) { max_agg_queues = MAX_AGG_QS(sc); fp->tpa_enable = TRUE; /* fill the TPA pool */ for (j = 0; j < max_agg_queues; j++) { rc = bxe_alloc_rx_tpa_mbuf(fp, j); if (rc != 0) { BLOGE(sc, "mbuf alloc fail for fp[%02d] TPA queue %d\n", i, j); fp->tpa_enable = FALSE; goto bxe_alloc_fp_buffers_error; } fp->rx_tpa_info[j].state = BXE_TPA_STATE_STOP; } if (fp->tpa_enable) { /* fill the RX SGE chain */ ring_prod = 0; for (j = 0; j < RX_SGE_USABLE; j++) { rc = bxe_alloc_rx_sge_mbuf(fp, ring_prod); if (rc != 0) { BLOGE(sc, "mbuf alloc fail for fp[%02d] SGE %d\n", i, ring_prod); fp->tpa_enable = FALSE; ring_prod = 0; goto bxe_alloc_fp_buffers_error; } ring_prod = RX_SGE_NEXT(ring_prod); } fp->rx_sge_prod = ring_prod; } } } return (0); bxe_alloc_fp_buffers_error: /* unwind what was already allocated */ bxe_free_rx_bd_chain(fp); bxe_free_tpa_pool(fp); bxe_free_sge_chain(fp); return (ENOBUFS); } static void bxe_free_fw_stats_mem(struct bxe_softc *sc) { bxe_dma_free(sc, &sc->fw_stats_dma); sc->fw_stats_num = 0; sc->fw_stats_req_size = 0; sc->fw_stats_req = NULL; sc->fw_stats_req_mapping = 0; sc->fw_stats_data_size = 0; sc->fw_stats_data = NULL; sc->fw_stats_data_mapping = 0; } static int bxe_alloc_fw_stats_mem(struct bxe_softc *sc) { uint8_t num_queue_stats; int num_groups; /* number of queues for statistics is number of eth queues */ num_queue_stats = BXE_NUM_ETH_QUEUES(sc); /* * Total number of FW statistics requests = * 1 for port stats + 1 for PF stats + num of queues */ sc->fw_stats_num = (2 + num_queue_stats); /* * Request is built from stats_query_header and an array of * stats_query_cmd_group each of which contains STATS_QUERY_CMD_COUNT * rules. The real number or requests is configured in the * stats_query_header. */ num_groups = ((sc->fw_stats_num / STATS_QUERY_CMD_COUNT) + ((sc->fw_stats_num % STATS_QUERY_CMD_COUNT) ? 1 : 0)); BLOGD(sc, DBG_LOAD, "stats fw_stats_num %d num_groups %d\n", sc->fw_stats_num, num_groups); sc->fw_stats_req_size = (sizeof(struct stats_query_header) + (num_groups * sizeof(struct stats_query_cmd_group))); /* * Data for statistics requests + stats_counter. * stats_counter holds per-STORM counters that are incremented when * STORM has finished with the current request. Memory for FCoE * offloaded statistics are counted anyway, even if they will not be sent. * VF stats are not accounted for here as the data of VF stats is stored * in memory allocated by the VF, not here. */ sc->fw_stats_data_size = (sizeof(struct stats_counter) + sizeof(struct per_port_stats) + sizeof(struct per_pf_stats) + /* sizeof(struct fcoe_statistics_params) + */ (sizeof(struct per_queue_stats) * num_queue_stats)); if (bxe_dma_alloc(sc, (sc->fw_stats_req_size + sc->fw_stats_data_size), &sc->fw_stats_dma, "fw stats") != 0) { bxe_free_fw_stats_mem(sc); return (-1); } /* set up the shortcuts */ sc->fw_stats_req = (struct bxe_fw_stats_req *)sc->fw_stats_dma.vaddr; sc->fw_stats_req_mapping = sc->fw_stats_dma.paddr; sc->fw_stats_data = (struct bxe_fw_stats_data *)((uint8_t *)sc->fw_stats_dma.vaddr + sc->fw_stats_req_size); sc->fw_stats_data_mapping = (sc->fw_stats_dma.paddr + sc->fw_stats_req_size); BLOGD(sc, DBG_LOAD, "statistics request base address set to %#jx\n", (uintmax_t)sc->fw_stats_req_mapping); BLOGD(sc, DBG_LOAD, "statistics data base address set to %#jx\n", (uintmax_t)sc->fw_stats_data_mapping); return (0); } /* * Bits map: * 0-7 - Engine0 load counter. * 8-15 - Engine1 load counter. * 16 - Engine0 RESET_IN_PROGRESS bit. * 17 - Engine1 RESET_IN_PROGRESS bit. * 18 - Engine0 ONE_IS_LOADED. Set when there is at least one active * function on the engine * 19 - Engine1 ONE_IS_LOADED. * 20 - Chip reset flow bit. When set none-leader must wait for both engines * leader to complete (check for both RESET_IN_PROGRESS bits and not * for just the one belonging to its engine). */ #define BXE_RECOVERY_GLOB_REG MISC_REG_GENERIC_POR_1 #define BXE_PATH0_LOAD_CNT_MASK 0x000000ff #define BXE_PATH0_LOAD_CNT_SHIFT 0 #define BXE_PATH1_LOAD_CNT_MASK 0x0000ff00 #define BXE_PATH1_LOAD_CNT_SHIFT 8 #define BXE_PATH0_RST_IN_PROG_BIT 0x00010000 #define BXE_PATH1_RST_IN_PROG_BIT 0x00020000 #define BXE_GLOBAL_RESET_BIT 0x00040000 /* set the GLOBAL_RESET bit, should be run under rtnl lock */ static void bxe_set_reset_global(struct bxe_softc *sc) { uint32_t val; bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); REG_WR(sc, BXE_RECOVERY_GLOB_REG, val | BXE_GLOBAL_RESET_BIT); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); } /* clear the GLOBAL_RESET bit, should be run under rtnl lock */ static void bxe_clear_reset_global(struct bxe_softc *sc) { uint32_t val; bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); REG_WR(sc, BXE_RECOVERY_GLOB_REG, val & (~BXE_GLOBAL_RESET_BIT)); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); } /* checks the GLOBAL_RESET bit, should be run under rtnl lock */ static uint8_t bxe_reset_is_global(struct bxe_softc *sc) { uint32_t val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); BLOGD(sc, DBG_LOAD, "GLOB_REG=0x%08x\n", val); return (val & BXE_GLOBAL_RESET_BIT) ? TRUE : FALSE; } /* clear RESET_IN_PROGRESS bit for the engine, should be run under rtnl lock */ static void bxe_set_reset_done(struct bxe_softc *sc) { uint32_t val; uint32_t bit = SC_PATH(sc) ? BXE_PATH1_RST_IN_PROG_BIT : BXE_PATH0_RST_IN_PROG_BIT; bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); /* Clear the bit */ val &= ~bit; REG_WR(sc, BXE_RECOVERY_GLOB_REG, val); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); } /* set RESET_IN_PROGRESS for the engine, should be run under rtnl lock */ static void bxe_set_reset_in_progress(struct bxe_softc *sc) { uint32_t val; uint32_t bit = SC_PATH(sc) ? BXE_PATH1_RST_IN_PROG_BIT : BXE_PATH0_RST_IN_PROG_BIT; bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); /* Set the bit */ val |= bit; REG_WR(sc, BXE_RECOVERY_GLOB_REG, val); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); } /* check RESET_IN_PROGRESS bit for an engine, should be run under rtnl lock */ static uint8_t bxe_reset_is_done(struct bxe_softc *sc, int engine) { uint32_t val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); uint32_t bit = engine ? BXE_PATH1_RST_IN_PROG_BIT : BXE_PATH0_RST_IN_PROG_BIT; /* return false if bit is set */ return (val & bit) ? FALSE : TRUE; } /* get the load status for an engine, should be run under rtnl lock */ static uint8_t bxe_get_load_status(struct bxe_softc *sc, int engine) { uint32_t mask = engine ? BXE_PATH1_LOAD_CNT_MASK : BXE_PATH0_LOAD_CNT_MASK; uint32_t shift = engine ? BXE_PATH1_LOAD_CNT_SHIFT : BXE_PATH0_LOAD_CNT_SHIFT; uint32_t val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); BLOGD(sc, DBG_LOAD, "Old value for GLOB_REG=0x%08x\n", val); val = ((val & mask) >> shift); BLOGD(sc, DBG_LOAD, "Load mask engine %d = 0x%08x\n", engine, val); return (val != 0); } /* set pf load mark */ /* XXX needs to be under rtnl lock */ static void bxe_set_pf_load(struct bxe_softc *sc) { uint32_t val; uint32_t val1; uint32_t mask = SC_PATH(sc) ? BXE_PATH1_LOAD_CNT_MASK : BXE_PATH0_LOAD_CNT_MASK; uint32_t shift = SC_PATH(sc) ? BXE_PATH1_LOAD_CNT_SHIFT : BXE_PATH0_LOAD_CNT_SHIFT; bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); BLOGD(sc, DBG_LOAD, "Old value for GLOB_REG=0x%08x\n", val); /* get the current counter value */ val1 = ((val & mask) >> shift); /* set bit of this PF */ val1 |= (1 << SC_ABS_FUNC(sc)); /* clear the old value */ val &= ~mask; /* set the new one */ val |= ((val1 << shift) & mask); REG_WR(sc, BXE_RECOVERY_GLOB_REG, val); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); } /* clear pf load mark */ /* XXX needs to be under rtnl lock */ static uint8_t bxe_clear_pf_load(struct bxe_softc *sc) { uint32_t val1, val; uint32_t mask = SC_PATH(sc) ? BXE_PATH1_LOAD_CNT_MASK : BXE_PATH0_LOAD_CNT_MASK; uint32_t shift = SC_PATH(sc) ? BXE_PATH1_LOAD_CNT_SHIFT : BXE_PATH0_LOAD_CNT_SHIFT; bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); val = REG_RD(sc, BXE_RECOVERY_GLOB_REG); BLOGD(sc, DBG_LOAD, "Old GEN_REG_VAL=0x%08x\n", val); /* get the current counter value */ val1 = (val & mask) >> shift; /* clear bit of that PF */ val1 &= ~(1 << SC_ABS_FUNC(sc)); /* clear the old value */ val &= ~mask; /* set the new one */ val |= ((val1 << shift) & mask); REG_WR(sc, BXE_RECOVERY_GLOB_REG, val); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RECOVERY_REG); return (val1 != 0); } /* send load requrest to mcp and analyze response */ static int bxe_nic_load_request(struct bxe_softc *sc, uint32_t *load_code) { /* init fw_seq */ sc->fw_seq = (SHMEM_RD(sc, func_mb[SC_FW_MB_IDX(sc)].drv_mb_header) & DRV_MSG_SEQ_NUMBER_MASK); BLOGD(sc, DBG_LOAD, "initial fw_seq 0x%04x\n", sc->fw_seq); /* get the current FW pulse sequence */ sc->fw_drv_pulse_wr_seq = (SHMEM_RD(sc, func_mb[SC_FW_MB_IDX(sc)].drv_pulse_mb) & DRV_PULSE_SEQ_MASK); BLOGD(sc, DBG_LOAD, "initial drv_pulse 0x%04x\n", sc->fw_drv_pulse_wr_seq); /* load request */ (*load_code) = bxe_fw_command(sc, DRV_MSG_CODE_LOAD_REQ, DRV_MSG_CODE_LOAD_REQ_WITH_LFA); /* if the MCP fails to respond we must abort */ if (!(*load_code)) { BLOGE(sc, "MCP response failure!\n"); return (-1); } /* if MCP refused then must abort */ if ((*load_code) == FW_MSG_CODE_DRV_LOAD_REFUSED) { BLOGE(sc, "MCP refused load request\n"); return (-1); } return (0); } /* * Check whether another PF has already loaded FW to chip. In virtualized * environments a pf from anoth VM may have already initialized the device * including loading FW. */ static int bxe_nic_load_analyze_req(struct bxe_softc *sc, uint32_t load_code) { uint32_t my_fw, loaded_fw; /* is another pf loaded on this engine? */ if ((load_code != FW_MSG_CODE_DRV_LOAD_COMMON_CHIP) && (load_code != FW_MSG_CODE_DRV_LOAD_COMMON)) { /* build my FW version dword */ my_fw = (BCM_5710_FW_MAJOR_VERSION + (BCM_5710_FW_MINOR_VERSION << 8 ) + (BCM_5710_FW_REVISION_VERSION << 16) + (BCM_5710_FW_ENGINEERING_VERSION << 24)); /* read loaded FW from chip */ loaded_fw = REG_RD(sc, XSEM_REG_PRAM); BLOGD(sc, DBG_LOAD, "loaded FW 0x%08x / my FW 0x%08x\n", loaded_fw, my_fw); /* abort nic load if version mismatch */ if (my_fw != loaded_fw) { BLOGE(sc, "FW 0x%08x already loaded (mine is 0x%08x)", loaded_fw, my_fw); return (-1); } } return (0); } /* mark PMF if applicable */ static void bxe_nic_load_pmf(struct bxe_softc *sc, uint32_t load_code) { uint32_t ncsi_oem_data_addr; if ((load_code == FW_MSG_CODE_DRV_LOAD_COMMON) || (load_code == FW_MSG_CODE_DRV_LOAD_COMMON_CHIP) || (load_code == FW_MSG_CODE_DRV_LOAD_PORT)) { /* * Barrier here for ordering between the writing to sc->port.pmf here * and reading it from the periodic task. */ sc->port.pmf = 1; mb(); } else { sc->port.pmf = 0; } BLOGD(sc, DBG_LOAD, "pmf %d\n", sc->port.pmf); /* XXX needed? */ if (load_code == FW_MSG_CODE_DRV_LOAD_COMMON_CHIP) { if (SHMEM2_HAS(sc, ncsi_oem_data_addr)) { ncsi_oem_data_addr = SHMEM2_RD(sc, ncsi_oem_data_addr); if (ncsi_oem_data_addr) { REG_WR(sc, (ncsi_oem_data_addr + offsetof(struct glob_ncsi_oem_data, driver_version)), 0); } } } } static void bxe_read_mf_cfg(struct bxe_softc *sc) { int n = (CHIP_IS_MODE_4_PORT(sc) ? 2 : 1); int abs_func; int vn; if (BXE_NOMCP(sc)) { return; /* what should be the default bvalue in this case */ } /* * The formula for computing the absolute function number is... * For 2 port configuration (4 functions per port): * abs_func = 2 * vn + SC_PORT + SC_PATH * For 4 port configuration (2 functions per port): * abs_func = 4 * vn + 2 * SC_PORT + SC_PATH */ for (vn = VN_0; vn < SC_MAX_VN_NUM(sc); vn++) { abs_func = (n * (2 * vn + SC_PORT(sc)) + SC_PATH(sc)); if (abs_func >= E1H_FUNC_MAX) { break; } sc->devinfo.mf_info.mf_config[vn] = MFCFG_RD(sc, func_mf_config[abs_func].config); } if (sc->devinfo.mf_info.mf_config[SC_VN(sc)] & FUNC_MF_CFG_FUNC_DISABLED) { BLOGD(sc, DBG_LOAD, "mf_cfg function disabled\n"); sc->flags |= BXE_MF_FUNC_DIS; } else { BLOGD(sc, DBG_LOAD, "mf_cfg function enabled\n"); sc->flags &= ~BXE_MF_FUNC_DIS; } } /* acquire split MCP access lock register */ static int bxe_acquire_alr(struct bxe_softc *sc) { uint32_t j, val; for (j = 0; j < 1000; j++) { val = (1UL << 31); REG_WR(sc, GRCBASE_MCP + 0x9c, val); val = REG_RD(sc, GRCBASE_MCP + 0x9c); if (val & (1L << 31)) break; DELAY(5000); } if (!(val & (1L << 31))) { BLOGE(sc, "Cannot acquire MCP access lock register\n"); return (-1); } return (0); } /* release split MCP access lock register */ static void bxe_release_alr(struct bxe_softc *sc) { REG_WR(sc, GRCBASE_MCP + 0x9c, 0); } static void bxe_fan_failure(struct bxe_softc *sc) { int port = SC_PORT(sc); uint32_t ext_phy_config; /* mark the failure */ ext_phy_config = SHMEM_RD(sc, dev_info.port_hw_config[port].external_phy_config); ext_phy_config &= ~PORT_HW_CFG_XGXS_EXT_PHY_TYPE_MASK; ext_phy_config |= PORT_HW_CFG_XGXS_EXT_PHY_TYPE_FAILURE; SHMEM_WR(sc, dev_info.port_hw_config[port].external_phy_config, ext_phy_config); /* log the failure */ BLOGW(sc, "Fan Failure has caused the driver to shutdown " "the card to prevent permanent damage. " "Please contact OEM Support for assistance\n"); /* XXX */ #if 1 bxe_panic(sc, ("Schedule task to handle fan failure\n")); #else /* * Schedule device reset (unload) * This is due to some boards consuming sufficient power when driver is * up to overheat if fan fails. */ bxe_set_bit(BXE_SP_RTNL_FAN_FAILURE, &sc->sp_rtnl_state); schedule_delayed_work(&sc->sp_rtnl_task, 0); #endif } /* this function is called upon a link interrupt */ static void bxe_link_attn(struct bxe_softc *sc) { uint32_t pause_enabled = 0; struct host_port_stats *pstats; int cmng_fns; /* Make sure that we are synced with the current statistics */ bxe_stats_handle(sc, STATS_EVENT_STOP); elink_link_update(&sc->link_params, &sc->link_vars); if (sc->link_vars.link_up) { /* dropless flow control */ if (!CHIP_IS_E1(sc) && sc->dropless_fc) { pause_enabled = 0; if (sc->link_vars.flow_ctrl & ELINK_FLOW_CTRL_TX) { pause_enabled = 1; } REG_WR(sc, (BAR_USTRORM_INTMEM + USTORM_ETH_PAUSE_ENABLED_OFFSET(SC_PORT(sc))), pause_enabled); } if (sc->link_vars.mac_type != ELINK_MAC_TYPE_EMAC) { pstats = BXE_SP(sc, port_stats); /* reset old mac stats */ memset(&(pstats->mac_stx[0]), 0, sizeof(struct mac_stx)); } if (sc->state == BXE_STATE_OPEN) { bxe_stats_handle(sc, STATS_EVENT_LINK_UP); } } if (sc->link_vars.link_up && sc->link_vars.line_speed) { cmng_fns = bxe_get_cmng_fns_mode(sc); if (cmng_fns != CMNG_FNS_NONE) { bxe_cmng_fns_init(sc, FALSE, cmng_fns); storm_memset_cmng(sc, &sc->cmng, SC_PORT(sc)); } else { /* rate shaping and fairness are disabled */ BLOGD(sc, DBG_LOAD, "single function mode without fairness\n"); } } bxe_link_report_locked(sc); if (IS_MF(sc)) { ; // XXX bxe_link_sync_notify(sc); } } static void bxe_attn_int_asserted(struct bxe_softc *sc, uint32_t asserted) { int port = SC_PORT(sc); uint32_t aeu_addr = port ? MISC_REG_AEU_MASK_ATTN_FUNC_1 : MISC_REG_AEU_MASK_ATTN_FUNC_0; uint32_t nig_int_mask_addr = port ? NIG_REG_MASK_INTERRUPT_PORT1 : NIG_REG_MASK_INTERRUPT_PORT0; uint32_t aeu_mask; uint32_t nig_mask = 0; uint32_t reg_addr; uint32_t igu_acked; uint32_t cnt; if (sc->attn_state & asserted) { BLOGE(sc, "IGU ERROR attn=0x%08x\n", asserted); } bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_PORT0_ATT_MASK + port); aeu_mask = REG_RD(sc, aeu_addr); BLOGD(sc, DBG_INTR, "aeu_mask 0x%08x newly asserted 0x%08x\n", aeu_mask, asserted); aeu_mask &= ~(asserted & 0x3ff); BLOGD(sc, DBG_INTR, "new mask 0x%08x\n", aeu_mask); REG_WR(sc, aeu_addr, aeu_mask); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_PORT0_ATT_MASK + port); BLOGD(sc, DBG_INTR, "attn_state 0x%08x\n", sc->attn_state); sc->attn_state |= asserted; BLOGD(sc, DBG_INTR, "new state 0x%08x\n", sc->attn_state); if (asserted & ATTN_HARD_WIRED_MASK) { if (asserted & ATTN_NIG_FOR_FUNC) { BXE_PHY_LOCK(sc); /* save nig interrupt mask */ nig_mask = REG_RD(sc, nig_int_mask_addr); /* If nig_mask is not set, no need to call the update function */ if (nig_mask) { REG_WR(sc, nig_int_mask_addr, 0); bxe_link_attn(sc); } /* handle unicore attn? */ } if (asserted & ATTN_SW_TIMER_4_FUNC) { BLOGD(sc, DBG_INTR, "ATTN_SW_TIMER_4_FUNC!\n"); } if (asserted & GPIO_2_FUNC) { BLOGD(sc, DBG_INTR, "GPIO_2_FUNC!\n"); } if (asserted & GPIO_3_FUNC) { BLOGD(sc, DBG_INTR, "GPIO_3_FUNC!\n"); } if (asserted & GPIO_4_FUNC) { BLOGD(sc, DBG_INTR, "GPIO_4_FUNC!\n"); } if (port == 0) { if (asserted & ATTN_GENERAL_ATTN_1) { BLOGD(sc, DBG_INTR, "ATTN_GENERAL_ATTN_1!\n"); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_1, 0x0); } if (asserted & ATTN_GENERAL_ATTN_2) { BLOGD(sc, DBG_INTR, "ATTN_GENERAL_ATTN_2!\n"); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_2, 0x0); } if (asserted & ATTN_GENERAL_ATTN_3) { BLOGD(sc, DBG_INTR, "ATTN_GENERAL_ATTN_3!\n"); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_3, 0x0); } } else { if (asserted & ATTN_GENERAL_ATTN_4) { BLOGD(sc, DBG_INTR, "ATTN_GENERAL_ATTN_4!\n"); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_4, 0x0); } if (asserted & ATTN_GENERAL_ATTN_5) { BLOGD(sc, DBG_INTR, "ATTN_GENERAL_ATTN_5!\n"); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_5, 0x0); } if (asserted & ATTN_GENERAL_ATTN_6) { BLOGD(sc, DBG_INTR, "ATTN_GENERAL_ATTN_6!\n"); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_6, 0x0); } } } /* hardwired */ if (sc->devinfo.int_block == INT_BLOCK_HC) { reg_addr = (HC_REG_COMMAND_REG + port*32 + COMMAND_REG_ATTN_BITS_SET); } else { reg_addr = (BAR_IGU_INTMEM + IGU_CMD_ATTN_BIT_SET_UPPER*8); } BLOGD(sc, DBG_INTR, "about to mask 0x%08x at %s addr 0x%08x\n", asserted, (sc->devinfo.int_block == INT_BLOCK_HC) ? "HC" : "IGU", reg_addr); REG_WR(sc, reg_addr, asserted); /* now set back the mask */ if (asserted & ATTN_NIG_FOR_FUNC) { /* * Verify that IGU ack through BAR was written before restoring * NIG mask. This loop should exit after 2-3 iterations max. */ if (sc->devinfo.int_block != INT_BLOCK_HC) { cnt = 0; do { igu_acked = REG_RD(sc, IGU_REG_ATTENTION_ACK_BITS); } while (((igu_acked & ATTN_NIG_FOR_FUNC) == 0) && (++cnt < MAX_IGU_ATTN_ACK_TO)); if (!igu_acked) { BLOGE(sc, "Failed to verify IGU ack on time\n"); } mb(); } REG_WR(sc, nig_int_mask_addr, nig_mask); BXE_PHY_UNLOCK(sc); } } static void bxe_print_next_block(struct bxe_softc *sc, int idx, const char *blk) { BLOGI(sc, "%s%s", idx ? ", " : "", blk); } static int bxe_check_blocks_with_parity0(struct bxe_softc *sc, uint32_t sig, int par_num, uint8_t print) { uint32_t cur_bit = 0; int i = 0; for (i = 0; sig; i++) { cur_bit = ((uint32_t)0x1 << i); if (sig & cur_bit) { switch (cur_bit) { case AEU_INPUTS_ATTN_BITS_BRB_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "BRB"); break; case AEU_INPUTS_ATTN_BITS_PARSER_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "PARSER"); break; case AEU_INPUTS_ATTN_BITS_TSDM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "TSDM"); break; case AEU_INPUTS_ATTN_BITS_SEARCHER_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "SEARCHER"); break; case AEU_INPUTS_ATTN_BITS_TCM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "TCM"); break; case AEU_INPUTS_ATTN_BITS_TSEMI_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "TSEMI"); break; case AEU_INPUTS_ATTN_BITS_PBCLIENT_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "XPB"); break; } /* Clear the bit */ sig &= ~cur_bit; } } return (par_num); } static int bxe_check_blocks_with_parity1(struct bxe_softc *sc, uint32_t sig, int par_num, uint8_t *global, uint8_t print) { int i = 0; uint32_t cur_bit = 0; for (i = 0; sig; i++) { cur_bit = ((uint32_t)0x1 << i); if (sig & cur_bit) { switch (cur_bit) { case AEU_INPUTS_ATTN_BITS_PBF_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "PBF"); break; case AEU_INPUTS_ATTN_BITS_QM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "QM"); break; case AEU_INPUTS_ATTN_BITS_TIMERS_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "TM"); break; case AEU_INPUTS_ATTN_BITS_XSDM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "XSDM"); break; case AEU_INPUTS_ATTN_BITS_XCM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "XCM"); break; case AEU_INPUTS_ATTN_BITS_XSEMI_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "XSEMI"); break; case AEU_INPUTS_ATTN_BITS_DOORBELLQ_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "DOORBELLQ"); break; case AEU_INPUTS_ATTN_BITS_NIG_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "NIG"); break; case AEU_INPUTS_ATTN_BITS_VAUX_PCI_CORE_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "VAUX PCI CORE"); *global = TRUE; break; case AEU_INPUTS_ATTN_BITS_DEBUG_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "DEBUG"); break; case AEU_INPUTS_ATTN_BITS_USDM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "USDM"); break; case AEU_INPUTS_ATTN_BITS_UCM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "UCM"); break; case AEU_INPUTS_ATTN_BITS_USEMI_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "USEMI"); break; case AEU_INPUTS_ATTN_BITS_UPB_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "UPB"); break; case AEU_INPUTS_ATTN_BITS_CSDM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "CSDM"); break; case AEU_INPUTS_ATTN_BITS_CCM_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "CCM"); break; } /* Clear the bit */ sig &= ~cur_bit; } } return (par_num); } static int bxe_check_blocks_with_parity2(struct bxe_softc *sc, uint32_t sig, int par_num, uint8_t print) { uint32_t cur_bit = 0; int i = 0; for (i = 0; sig; i++) { cur_bit = ((uint32_t)0x1 << i); if (sig & cur_bit) { switch (cur_bit) { case AEU_INPUTS_ATTN_BITS_CSEMI_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "CSEMI"); break; case AEU_INPUTS_ATTN_BITS_PXP_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "PXP"); break; case AEU_IN_ATTN_BITS_PXPPCICLOCKCLIENT_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "PXPPCICLOCKCLIENT"); break; case AEU_INPUTS_ATTN_BITS_CFC_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "CFC"); break; case AEU_INPUTS_ATTN_BITS_CDU_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "CDU"); break; case AEU_INPUTS_ATTN_BITS_DMAE_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "DMAE"); break; case AEU_INPUTS_ATTN_BITS_IGU_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "IGU"); break; case AEU_INPUTS_ATTN_BITS_MISC_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "MISC"); break; } /* Clear the bit */ sig &= ~cur_bit; } } return (par_num); } static int bxe_check_blocks_with_parity3(struct bxe_softc *sc, uint32_t sig, int par_num, uint8_t *global, uint8_t print) { uint32_t cur_bit = 0; int i = 0; for (i = 0; sig; i++) { cur_bit = ((uint32_t)0x1 << i); if (sig & cur_bit) { switch (cur_bit) { case AEU_INPUTS_ATTN_BITS_MCP_LATCHED_ROM_PARITY: if (print) bxe_print_next_block(sc, par_num++, "MCP ROM"); *global = TRUE; break; case AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_RX_PARITY: if (print) bxe_print_next_block(sc, par_num++, "MCP UMP RX"); *global = TRUE; break; case AEU_INPUTS_ATTN_BITS_MCP_LATCHED_UMP_TX_PARITY: if (print) bxe_print_next_block(sc, par_num++, "MCP UMP TX"); *global = TRUE; break; case AEU_INPUTS_ATTN_BITS_MCP_LATCHED_SCPAD_PARITY: if (print) bxe_print_next_block(sc, par_num++, "MCP SCPAD"); *global = TRUE; break; } /* Clear the bit */ sig &= ~cur_bit; } } return (par_num); } static int bxe_check_blocks_with_parity4(struct bxe_softc *sc, uint32_t sig, int par_num, uint8_t print) { uint32_t cur_bit = 0; int i = 0; for (i = 0; sig; i++) { cur_bit = ((uint32_t)0x1 << i); if (sig & cur_bit) { switch (cur_bit) { case AEU_INPUTS_ATTN_BITS_PGLUE_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "PGLUE_B"); break; case AEU_INPUTS_ATTN_BITS_ATC_PARITY_ERROR: if (print) bxe_print_next_block(sc, par_num++, "ATC"); break; } /* Clear the bit */ sig &= ~cur_bit; } } return (par_num); } static uint8_t bxe_parity_attn(struct bxe_softc *sc, uint8_t *global, uint8_t print, uint32_t *sig) { int par_num = 0; if ((sig[0] & HW_PRTY_ASSERT_SET_0) || (sig[1] & HW_PRTY_ASSERT_SET_1) || (sig[2] & HW_PRTY_ASSERT_SET_2) || (sig[3] & HW_PRTY_ASSERT_SET_3) || (sig[4] & HW_PRTY_ASSERT_SET_4)) { BLOGE(sc, "Parity error: HW block parity attention:\n" "[0]:0x%08x [1]:0x%08x [2]:0x%08x [3]:0x%08x [4]:0x%08x\n", (uint32_t)(sig[0] & HW_PRTY_ASSERT_SET_0), (uint32_t)(sig[1] & HW_PRTY_ASSERT_SET_1), (uint32_t)(sig[2] & HW_PRTY_ASSERT_SET_2), (uint32_t)(sig[3] & HW_PRTY_ASSERT_SET_3), (uint32_t)(sig[4] & HW_PRTY_ASSERT_SET_4)); if (print) BLOGI(sc, "Parity errors detected in blocks: "); par_num = bxe_check_blocks_with_parity0(sc, sig[0] & HW_PRTY_ASSERT_SET_0, par_num, print); par_num = bxe_check_blocks_with_parity1(sc, sig[1] & HW_PRTY_ASSERT_SET_1, par_num, global, print); par_num = bxe_check_blocks_with_parity2(sc, sig[2] & HW_PRTY_ASSERT_SET_2, par_num, print); par_num = bxe_check_blocks_with_parity3(sc, sig[3] & HW_PRTY_ASSERT_SET_3, par_num, global, print); par_num = bxe_check_blocks_with_parity4(sc, sig[4] & HW_PRTY_ASSERT_SET_4, par_num, print); if (print) BLOGI(sc, "\n"); return (TRUE); } return (FALSE); } static uint8_t bxe_chk_parity_attn(struct bxe_softc *sc, uint8_t *global, uint8_t print) { struct attn_route attn = { {0} }; int port = SC_PORT(sc); attn.sig[0] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_1_FUNC_0 + port*4); attn.sig[1] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_2_FUNC_0 + port*4); attn.sig[2] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_3_FUNC_0 + port*4); attn.sig[3] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_4_FUNC_0 + port*4); if (!CHIP_IS_E1x(sc)) attn.sig[4] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_5_FUNC_0 + port*4); return (bxe_parity_attn(sc, global, print, attn.sig)); } static void bxe_attn_int_deasserted4(struct bxe_softc *sc, uint32_t attn) { uint32_t val; if (attn & AEU_INPUTS_ATTN_BITS_PGLUE_HW_INTERRUPT) { val = REG_RD(sc, PGLUE_B_REG_PGLUE_B_INT_STS_CLR); BLOGE(sc, "PGLUE hw attention 0x%08x\n", val); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_ADDRESS_ERROR) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_ADDRESS_ERROR\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_INCORRECT_RCV_BEHAVIOR) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_INCORRECT_RCV_BEHAVIOR\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_WAS_ERROR_ATTN) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_WAS_ERROR_ATTN\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_VF_LENGTH_VIOLATION_ATTN) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_VF_LENGTH_VIOLATION_ATTN\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_VF_GRC_SPACE_VIOLATION_ATTN) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_VF_GRC_SPACE_VIOLATION_ATTN\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_VF_MSIX_BAR_VIOLATION_ATTN) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_VF_MSIX_BAR_VIOLATION_ATTN\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_TCPL_ERROR_ATTN) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_TCPL_ERROR_ATTN\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_TCPL_IN_TWO_RCBS_ATTN) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_TCPL_IN_TWO_RCBS_ATTN\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_CSSNOOP_FIFO_OVERFLOW) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_CSSNOOP_FIFO_OVERFLOW\n"); } if (attn & AEU_INPUTS_ATTN_BITS_ATC_HW_INTERRUPT) { val = REG_RD(sc, ATC_REG_ATC_INT_STS_CLR); BLOGE(sc, "ATC hw attention 0x%08x\n", val); if (val & ATC_ATC_INT_STS_REG_ADDRESS_ERROR) BLOGE(sc, "ATC_ATC_INT_STS_REG_ADDRESS_ERROR\n"); if (val & ATC_ATC_INT_STS_REG_ATC_TCPL_TO_NOT_PEND) BLOGE(sc, "ATC_ATC_INT_STS_REG_ATC_TCPL_TO_NOT_PEND\n"); if (val & ATC_ATC_INT_STS_REG_ATC_GPA_MULTIPLE_HITS) BLOGE(sc, "ATC_ATC_INT_STS_REG_ATC_GPA_MULTIPLE_HITS\n"); if (val & ATC_ATC_INT_STS_REG_ATC_RCPL_TO_EMPTY_CNT) BLOGE(sc, "ATC_ATC_INT_STS_REG_ATC_RCPL_TO_EMPTY_CNT\n"); if (val & ATC_ATC_INT_STS_REG_ATC_TCPL_ERROR) BLOGE(sc, "ATC_ATC_INT_STS_REG_ATC_TCPL_ERROR\n"); if (val & ATC_ATC_INT_STS_REG_ATC_IREQ_LESS_THAN_STU) BLOGE(sc, "ATC_ATC_INT_STS_REG_ATC_IREQ_LESS_THAN_STU\n"); } if (attn & (AEU_INPUTS_ATTN_BITS_PGLUE_PARITY_ERROR | AEU_INPUTS_ATTN_BITS_ATC_PARITY_ERROR)) { BLOGE(sc, "FATAL parity attention set4 0x%08x\n", (uint32_t)(attn & (AEU_INPUTS_ATTN_BITS_PGLUE_PARITY_ERROR | AEU_INPUTS_ATTN_BITS_ATC_PARITY_ERROR))); } } static void bxe_e1h_disable(struct bxe_softc *sc) { int port = SC_PORT(sc); bxe_tx_disable(sc); REG_WR(sc, NIG_REG_LLH0_FUNC_EN + port*8, 0); } static void bxe_e1h_enable(struct bxe_softc *sc) { int port = SC_PORT(sc); REG_WR(sc, NIG_REG_LLH0_FUNC_EN + port*8, 1); // XXX bxe_tx_enable(sc); } /* * called due to MCP event (on pmf): * reread new bandwidth configuration * configure FW * notify others function about the change */ static void bxe_config_mf_bw(struct bxe_softc *sc) { if (sc->link_vars.link_up) { bxe_cmng_fns_init(sc, TRUE, CMNG_FNS_MINMAX); // XXX bxe_link_sync_notify(sc); } storm_memset_cmng(sc, &sc->cmng, SC_PORT(sc)); } static void bxe_set_mf_bw(struct bxe_softc *sc) { bxe_config_mf_bw(sc); bxe_fw_command(sc, DRV_MSG_CODE_SET_MF_BW_ACK, 0); } static void bxe_handle_eee_event(struct bxe_softc *sc) { BLOGD(sc, DBG_INTR, "EEE - LLDP event\n"); bxe_fw_command(sc, DRV_MSG_CODE_EEE_RESULTS_ACK, 0); } #define DRV_INFO_ETH_STAT_NUM_MACS_REQUIRED 3 static void bxe_drv_info_ether_stat(struct bxe_softc *sc) { struct eth_stats_info *ether_stat = &sc->sp->drv_info_to_mcp.ether_stat; strlcpy(ether_stat->version, BXE_DRIVER_VERSION, ETH_STAT_INFO_VERSION_LEN); /* XXX (+ MAC_PAD) taken from other driver... verify this is right */ sc->sp_objs[0].mac_obj.get_n_elements(sc, &sc->sp_objs[0].mac_obj, DRV_INFO_ETH_STAT_NUM_MACS_REQUIRED, ether_stat->mac_local + MAC_PAD, MAC_PAD, ETH_ALEN); ether_stat->mtu_size = sc->mtu; ether_stat->feature_flags |= FEATURE_ETH_CHKSUM_OFFLOAD_MASK; if (if_getcapenable(sc->ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) { ether_stat->feature_flags |= FEATURE_ETH_LSO_MASK; } // XXX ether_stat->feature_flags |= ???; ether_stat->promiscuous_mode = 0; // (flags & PROMISC) ? 1 : 0; ether_stat->txq_size = sc->tx_ring_size; ether_stat->rxq_size = sc->rx_ring_size; } static void bxe_handle_drv_info_req(struct bxe_softc *sc) { enum drv_info_opcode op_code; uint32_t drv_info_ctl = SHMEM2_RD(sc, drv_info_control); /* if drv_info version supported by MFW doesn't match - send NACK */ if ((drv_info_ctl & DRV_INFO_CONTROL_VER_MASK) != DRV_INFO_CUR_VER) { bxe_fw_command(sc, DRV_MSG_CODE_DRV_INFO_NACK, 0); return; } op_code = ((drv_info_ctl & DRV_INFO_CONTROL_OP_CODE_MASK) >> DRV_INFO_CONTROL_OP_CODE_SHIFT); memset(&sc->sp->drv_info_to_mcp, 0, sizeof(union drv_info_to_mcp)); switch (op_code) { case ETH_STATS_OPCODE: bxe_drv_info_ether_stat(sc); break; case FCOE_STATS_OPCODE: case ISCSI_STATS_OPCODE: default: /* if op code isn't supported - send NACK */ bxe_fw_command(sc, DRV_MSG_CODE_DRV_INFO_NACK, 0); return; } /* * If we got drv_info attn from MFW then these fields are defined in * shmem2 for sure */ SHMEM2_WR(sc, drv_info_host_addr_lo, U64_LO(BXE_SP_MAPPING(sc, drv_info_to_mcp))); SHMEM2_WR(sc, drv_info_host_addr_hi, U64_HI(BXE_SP_MAPPING(sc, drv_info_to_mcp))); bxe_fw_command(sc, DRV_MSG_CODE_DRV_INFO_ACK, 0); } static void bxe_dcc_event(struct bxe_softc *sc, uint32_t dcc_event) { BLOGD(sc, DBG_INTR, "dcc_event 0x%08x\n", dcc_event); if (dcc_event & DRV_STATUS_DCC_DISABLE_ENABLE_PF) { /* * This is the only place besides the function initialization * where the sc->flags can change so it is done without any * locks */ if (sc->devinfo.mf_info.mf_config[SC_VN(sc)] & FUNC_MF_CFG_FUNC_DISABLED) { BLOGD(sc, DBG_INTR, "mf_cfg function disabled\n"); sc->flags |= BXE_MF_FUNC_DIS; bxe_e1h_disable(sc); } else { BLOGD(sc, DBG_INTR, "mf_cfg function enabled\n"); sc->flags &= ~BXE_MF_FUNC_DIS; bxe_e1h_enable(sc); } dcc_event &= ~DRV_STATUS_DCC_DISABLE_ENABLE_PF; } if (dcc_event & DRV_STATUS_DCC_BANDWIDTH_ALLOCATION) { bxe_config_mf_bw(sc); dcc_event &= ~DRV_STATUS_DCC_BANDWIDTH_ALLOCATION; } /* Report results to MCP */ if (dcc_event) bxe_fw_command(sc, DRV_MSG_CODE_DCC_FAILURE, 0); else bxe_fw_command(sc, DRV_MSG_CODE_DCC_OK, 0); } static void bxe_pmf_update(struct bxe_softc *sc) { int port = SC_PORT(sc); uint32_t val; sc->port.pmf = 1; BLOGD(sc, DBG_INTR, "pmf %d\n", sc->port.pmf); /* * We need the mb() to ensure the ordering between the writing to * sc->port.pmf here and reading it from the bxe_periodic_task(). */ mb(); /* queue a periodic task */ // XXX schedule task... // XXX bxe_dcbx_pmf_update(sc); /* enable nig attention */ val = (0xff0f | (1 << (SC_VN(sc) + 4))); if (sc->devinfo.int_block == INT_BLOCK_HC) { REG_WR(sc, HC_REG_TRAILING_EDGE_0 + port*8, val); REG_WR(sc, HC_REG_LEADING_EDGE_0 + port*8, val); } else if (!CHIP_IS_E1x(sc)) { REG_WR(sc, IGU_REG_TRAILING_EDGE_LATCH, val); REG_WR(sc, IGU_REG_LEADING_EDGE_LATCH, val); } bxe_stats_handle(sc, STATS_EVENT_PMF); } static int bxe_mc_assert(struct bxe_softc *sc) { char last_idx; int i, rc = 0; uint32_t row0, row1, row2, row3; /* XSTORM */ last_idx = REG_RD8(sc, BAR_XSTRORM_INTMEM + XSTORM_ASSERT_LIST_INDEX_OFFSET); if (last_idx) BLOGE(sc, "XSTORM_ASSERT_LIST_INDEX 0x%x\n", last_idx); /* print the asserts */ for (i = 0; i < STORM_ASSERT_ARRAY_SIZE; i++) { row0 = REG_RD(sc, BAR_XSTRORM_INTMEM + XSTORM_ASSERT_LIST_OFFSET(i)); row1 = REG_RD(sc, BAR_XSTRORM_INTMEM + XSTORM_ASSERT_LIST_OFFSET(i) + 4); row2 = REG_RD(sc, BAR_XSTRORM_INTMEM + XSTORM_ASSERT_LIST_OFFSET(i) + 8); row3 = REG_RD(sc, BAR_XSTRORM_INTMEM + XSTORM_ASSERT_LIST_OFFSET(i) + 12); if (row0 != COMMON_ASM_INVALID_ASSERT_OPCODE) { BLOGE(sc, "XSTORM_ASSERT_INDEX 0x%x = 0x%08x 0x%08x 0x%08x 0x%08x\n", i, row3, row2, row1, row0); rc++; } else { break; } } /* TSTORM */ last_idx = REG_RD8(sc, BAR_TSTRORM_INTMEM + TSTORM_ASSERT_LIST_INDEX_OFFSET); if (last_idx) { BLOGE(sc, "TSTORM_ASSERT_LIST_INDEX 0x%x\n", last_idx); } /* print the asserts */ for (i = 0; i < STORM_ASSERT_ARRAY_SIZE; i++) { row0 = REG_RD(sc, BAR_TSTRORM_INTMEM + TSTORM_ASSERT_LIST_OFFSET(i)); row1 = REG_RD(sc, BAR_TSTRORM_INTMEM + TSTORM_ASSERT_LIST_OFFSET(i) + 4); row2 = REG_RD(sc, BAR_TSTRORM_INTMEM + TSTORM_ASSERT_LIST_OFFSET(i) + 8); row3 = REG_RD(sc, BAR_TSTRORM_INTMEM + TSTORM_ASSERT_LIST_OFFSET(i) + 12); if (row0 != COMMON_ASM_INVALID_ASSERT_OPCODE) { BLOGE(sc, "TSTORM_ASSERT_INDEX 0x%x = 0x%08x 0x%08x 0x%08x 0x%08x\n", i, row3, row2, row1, row0); rc++; } else { break; } } /* CSTORM */ last_idx = REG_RD8(sc, BAR_CSTRORM_INTMEM + CSTORM_ASSERT_LIST_INDEX_OFFSET); if (last_idx) { BLOGE(sc, "CSTORM_ASSERT_LIST_INDEX 0x%x\n", last_idx); } /* print the asserts */ for (i = 0; i < STORM_ASSERT_ARRAY_SIZE; i++) { row0 = REG_RD(sc, BAR_CSTRORM_INTMEM + CSTORM_ASSERT_LIST_OFFSET(i)); row1 = REG_RD(sc, BAR_CSTRORM_INTMEM + CSTORM_ASSERT_LIST_OFFSET(i) + 4); row2 = REG_RD(sc, BAR_CSTRORM_INTMEM + CSTORM_ASSERT_LIST_OFFSET(i) + 8); row3 = REG_RD(sc, BAR_CSTRORM_INTMEM + CSTORM_ASSERT_LIST_OFFSET(i) + 12); if (row0 != COMMON_ASM_INVALID_ASSERT_OPCODE) { BLOGE(sc, "CSTORM_ASSERT_INDEX 0x%x = 0x%08x 0x%08x 0x%08x 0x%08x\n", i, row3, row2, row1, row0); rc++; } else { break; } } /* USTORM */ last_idx = REG_RD8(sc, BAR_USTRORM_INTMEM + USTORM_ASSERT_LIST_INDEX_OFFSET); if (last_idx) { BLOGE(sc, "USTORM_ASSERT_LIST_INDEX 0x%x\n", last_idx); } /* print the asserts */ for (i = 0; i < STORM_ASSERT_ARRAY_SIZE; i++) { row0 = REG_RD(sc, BAR_USTRORM_INTMEM + USTORM_ASSERT_LIST_OFFSET(i)); row1 = REG_RD(sc, BAR_USTRORM_INTMEM + USTORM_ASSERT_LIST_OFFSET(i) + 4); row2 = REG_RD(sc, BAR_USTRORM_INTMEM + USTORM_ASSERT_LIST_OFFSET(i) + 8); row3 = REG_RD(sc, BAR_USTRORM_INTMEM + USTORM_ASSERT_LIST_OFFSET(i) + 12); if (row0 != COMMON_ASM_INVALID_ASSERT_OPCODE) { BLOGE(sc, "USTORM_ASSERT_INDEX 0x%x = 0x%08x 0x%08x 0x%08x 0x%08x\n", i, row3, row2, row1, row0); rc++; } else { break; } } return (rc); } static void bxe_attn_int_deasserted3(struct bxe_softc *sc, uint32_t attn) { int func = SC_FUNC(sc); uint32_t val; if (attn & EVEREST_GEN_ATTN_IN_USE_MASK) { if (attn & BXE_PMF_LINK_ASSERT(sc)) { REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_12 + func*4, 0); bxe_read_mf_cfg(sc); sc->devinfo.mf_info.mf_config[SC_VN(sc)] = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].config); val = SHMEM_RD(sc, func_mb[SC_FW_MB_IDX(sc)].drv_status); if (val & DRV_STATUS_DCC_EVENT_MASK) bxe_dcc_event(sc, (val & DRV_STATUS_DCC_EVENT_MASK)); if (val & DRV_STATUS_SET_MF_BW) bxe_set_mf_bw(sc); if (val & DRV_STATUS_DRV_INFO_REQ) bxe_handle_drv_info_req(sc); #if 0 if (val & DRV_STATUS_VF_DISABLED) bxe_vf_handle_flr_event(sc); #endif if ((sc->port.pmf == 0) && (val & DRV_STATUS_PMF)) bxe_pmf_update(sc); #if 0 if (sc->port.pmf && (val & DRV_STATUS_DCBX_NEGOTIATION_RESULTS) && (sc->dcbx_enabled > 0)) /* start dcbx state machine */ bxe_dcbx_set_params(sc, BXE_DCBX_STATE_NEG_RECEIVED); #endif #if 0 if (val & DRV_STATUS_AFEX_EVENT_MASK) bxe_handle_afex_cmd(sc, val & DRV_STATUS_AFEX_EVENT_MASK); #endif if (val & DRV_STATUS_EEE_NEGOTIATION_RESULTS) bxe_handle_eee_event(sc); if (sc->link_vars.periodic_flags & ELINK_PERIODIC_FLAGS_LINK_EVENT) { /* sync with link */ BXE_PHY_LOCK(sc); sc->link_vars.periodic_flags &= ~ELINK_PERIODIC_FLAGS_LINK_EVENT; BXE_PHY_UNLOCK(sc); if (IS_MF(sc)) ; // XXX bxe_link_sync_notify(sc); bxe_link_report(sc); } /* * Always call it here: bxe_link_report() will * prevent the link indication duplication. */ bxe_link_status_update(sc); } else if (attn & BXE_MC_ASSERT_BITS) { BLOGE(sc, "MC assert!\n"); bxe_mc_assert(sc); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_10, 0); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_9, 0); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_8, 0); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_7, 0); bxe_panic(sc, ("MC assert!\n")); } else if (attn & BXE_MCP_ASSERT) { BLOGE(sc, "MCP assert!\n"); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_11, 0); // XXX bxe_fw_dump(sc); } else { BLOGE(sc, "Unknown HW assert! (attn 0x%08x)\n", attn); } } if (attn & EVEREST_LATCHED_ATTN_IN_USE_MASK) { BLOGE(sc, "LATCHED attention 0x%08x (masked)\n", attn); if (attn & BXE_GRC_TIMEOUT) { val = CHIP_IS_E1(sc) ? 0 : REG_RD(sc, MISC_REG_GRC_TIMEOUT_ATTN); BLOGE(sc, "GRC time-out 0x%08x\n", val); } if (attn & BXE_GRC_RSV) { val = CHIP_IS_E1(sc) ? 0 : REG_RD(sc, MISC_REG_GRC_RSV_ATTN); BLOGE(sc, "GRC reserved 0x%08x\n", val); } REG_WR(sc, MISC_REG_AEU_CLR_LATCH_SIGNAL, 0x7ff); } } static void bxe_attn_int_deasserted2(struct bxe_softc *sc, uint32_t attn) { int port = SC_PORT(sc); int reg_offset; uint32_t val0, mask0, val1, mask1; uint32_t val; if (attn & AEU_INPUTS_ATTN_BITS_CFC_HW_INTERRUPT) { val = REG_RD(sc, CFC_REG_CFC_INT_STS_CLR); BLOGE(sc, "CFC hw attention 0x%08x\n", val); /* CFC error attention */ if (val & 0x2) { BLOGE(sc, "FATAL error from CFC\n"); } } if (attn & AEU_INPUTS_ATTN_BITS_PXP_HW_INTERRUPT) { val = REG_RD(sc, PXP_REG_PXP_INT_STS_CLR_0); BLOGE(sc, "PXP hw attention-0 0x%08x\n", val); /* RQ_USDMDP_FIFO_OVERFLOW */ if (val & 0x18000) { BLOGE(sc, "FATAL error from PXP\n"); } if (!CHIP_IS_E1x(sc)) { val = REG_RD(sc, PXP_REG_PXP_INT_STS_CLR_1); BLOGE(sc, "PXP hw attention-1 0x%08x\n", val); } } #define PXP2_EOP_ERROR_BIT PXP2_PXP2_INT_STS_CLR_0_REG_WR_PGLUE_EOP_ERROR #define AEU_PXP2_HW_INT_BIT AEU_INPUTS_ATTN_BITS_PXPPCICLOCKCLIENT_HW_INTERRUPT if (attn & AEU_PXP2_HW_INT_BIT) { /* CQ47854 workaround do not panic on * PXP2_PXP2_INT_STS_0_REG_WR_PGLUE_EOP_ERROR */ if (!CHIP_IS_E1x(sc)) { mask0 = REG_RD(sc, PXP2_REG_PXP2_INT_MASK_0); val1 = REG_RD(sc, PXP2_REG_PXP2_INT_STS_1); mask1 = REG_RD(sc, PXP2_REG_PXP2_INT_MASK_1); val0 = REG_RD(sc, PXP2_REG_PXP2_INT_STS_0); /* * If the olny PXP2_EOP_ERROR_BIT is set in * STS0 and STS1 - clear it * * probably we lose additional attentions between * STS0 and STS_CLR0, in this case user will not * be notified about them */ if (val0 & mask0 & PXP2_EOP_ERROR_BIT && !(val1 & mask1)) val0 = REG_RD(sc, PXP2_REG_PXP2_INT_STS_CLR_0); /* print the register, since no one can restore it */ BLOGE(sc, "PXP2_REG_PXP2_INT_STS_CLR_0 0x%08x\n", val0); /* * if PXP2_PXP2_INT_STS_0_REG_WR_PGLUE_EOP_ERROR * then notify */ if (val0 & PXP2_EOP_ERROR_BIT) { BLOGE(sc, "PXP2_WR_PGLUE_EOP_ERROR\n"); /* * if only PXP2_PXP2_INT_STS_0_REG_WR_PGLUE_EOP_ERROR is * set then clear attention from PXP2 block without panic */ if (((val0 & mask0) == PXP2_EOP_ERROR_BIT) && ((val1 & mask1) == 0)) attn &= ~AEU_PXP2_HW_INT_BIT; } } } if (attn & HW_INTERRUT_ASSERT_SET_2) { reg_offset = (port ? MISC_REG_AEU_ENABLE1_FUNC_1_OUT_2 : MISC_REG_AEU_ENABLE1_FUNC_0_OUT_2); val = REG_RD(sc, reg_offset); val &= ~(attn & HW_INTERRUT_ASSERT_SET_2); REG_WR(sc, reg_offset, val); BLOGE(sc, "FATAL HW block attention set2 0x%x\n", (uint32_t)(attn & HW_INTERRUT_ASSERT_SET_2)); bxe_panic(sc, ("HW block attention set2\n")); } } static void bxe_attn_int_deasserted1(struct bxe_softc *sc, uint32_t attn) { int port = SC_PORT(sc); int reg_offset; uint32_t val; if (attn & AEU_INPUTS_ATTN_BITS_DOORBELLQ_HW_INTERRUPT) { val = REG_RD(sc, DORQ_REG_DORQ_INT_STS_CLR); BLOGE(sc, "DB hw attention 0x%08x\n", val); /* DORQ discard attention */ if (val & 0x2) { BLOGE(sc, "FATAL error from DORQ\n"); } } if (attn & HW_INTERRUT_ASSERT_SET_1) { reg_offset = (port ? MISC_REG_AEU_ENABLE1_FUNC_1_OUT_1 : MISC_REG_AEU_ENABLE1_FUNC_0_OUT_1); val = REG_RD(sc, reg_offset); val &= ~(attn & HW_INTERRUT_ASSERT_SET_1); REG_WR(sc, reg_offset, val); BLOGE(sc, "FATAL HW block attention set1 0x%08x\n", (uint32_t)(attn & HW_INTERRUT_ASSERT_SET_1)); bxe_panic(sc, ("HW block attention set1\n")); } } static void bxe_attn_int_deasserted0(struct bxe_softc *sc, uint32_t attn) { int port = SC_PORT(sc); int reg_offset; uint32_t val; reg_offset = (port) ? MISC_REG_AEU_ENABLE1_FUNC_1_OUT_0 : MISC_REG_AEU_ENABLE1_FUNC_0_OUT_0; if (attn & AEU_INPUTS_ATTN_BITS_SPIO5) { val = REG_RD(sc, reg_offset); val &= ~AEU_INPUTS_ATTN_BITS_SPIO5; REG_WR(sc, reg_offset, val); BLOGW(sc, "SPIO5 hw attention\n"); /* Fan failure attention */ elink_hw_reset_phy(&sc->link_params); bxe_fan_failure(sc); } if ((attn & sc->link_vars.aeu_int_mask) && sc->port.pmf) { BXE_PHY_LOCK(sc); elink_handle_module_detect_int(&sc->link_params); BXE_PHY_UNLOCK(sc); } if (attn & HW_INTERRUT_ASSERT_SET_0) { val = REG_RD(sc, reg_offset); val &= ~(attn & HW_INTERRUT_ASSERT_SET_0); REG_WR(sc, reg_offset, val); bxe_panic(sc, ("FATAL HW block attention set0 0x%lx\n", (attn & HW_INTERRUT_ASSERT_SET_0))); } } static void bxe_attn_int_deasserted(struct bxe_softc *sc, uint32_t deasserted) { struct attn_route attn; struct attn_route *group_mask; int port = SC_PORT(sc); int index; uint32_t reg_addr; uint32_t val; uint32_t aeu_mask; uint8_t global = FALSE; /* * Need to take HW lock because MCP or other port might also * try to handle this event. */ bxe_acquire_alr(sc); if (bxe_chk_parity_attn(sc, &global, TRUE)) { /* XXX * In case of parity errors don't handle attentions so that * other function would "see" parity errors. */ sc->recovery_state = BXE_RECOVERY_INIT; // XXX schedule a recovery task... /* disable HW interrupts */ bxe_int_disable(sc); bxe_release_alr(sc); return; } attn.sig[0] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_1_FUNC_0 + port*4); attn.sig[1] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_2_FUNC_0 + port*4); attn.sig[2] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_3_FUNC_0 + port*4); attn.sig[3] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_4_FUNC_0 + port*4); if (!CHIP_IS_E1x(sc)) { attn.sig[4] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_5_FUNC_0 + port*4); } else { attn.sig[4] = 0; } BLOGD(sc, DBG_INTR, "attn: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n", attn.sig[0], attn.sig[1], attn.sig[2], attn.sig[3], attn.sig[4]); for (index = 0; index < MAX_DYNAMIC_ATTN_GRPS; index++) { if (deasserted & (1 << index)) { group_mask = &sc->attn_group[index]; BLOGD(sc, DBG_INTR, "group[%d]: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n", index, group_mask->sig[0], group_mask->sig[1], group_mask->sig[2], group_mask->sig[3], group_mask->sig[4]); bxe_attn_int_deasserted4(sc, attn.sig[4] & group_mask->sig[4]); bxe_attn_int_deasserted3(sc, attn.sig[3] & group_mask->sig[3]); bxe_attn_int_deasserted1(sc, attn.sig[1] & group_mask->sig[1]); bxe_attn_int_deasserted2(sc, attn.sig[2] & group_mask->sig[2]); bxe_attn_int_deasserted0(sc, attn.sig[0] & group_mask->sig[0]); } } bxe_release_alr(sc); if (sc->devinfo.int_block == INT_BLOCK_HC) { reg_addr = (HC_REG_COMMAND_REG + port*32 + COMMAND_REG_ATTN_BITS_CLR); } else { reg_addr = (BAR_IGU_INTMEM + IGU_CMD_ATTN_BIT_CLR_UPPER*8); } val = ~deasserted; BLOGD(sc, DBG_INTR, "about to mask 0x%08x at %s addr 0x%08x\n", val, (sc->devinfo.int_block == INT_BLOCK_HC) ? "HC" : "IGU", reg_addr); REG_WR(sc, reg_addr, val); if (~sc->attn_state & deasserted) { BLOGE(sc, "IGU error\n"); } reg_addr = port ? MISC_REG_AEU_MASK_ATTN_FUNC_1 : MISC_REG_AEU_MASK_ATTN_FUNC_0; bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_PORT0_ATT_MASK + port); aeu_mask = REG_RD(sc, reg_addr); BLOGD(sc, DBG_INTR, "aeu_mask 0x%08x newly deasserted 0x%08x\n", aeu_mask, deasserted); aeu_mask |= (deasserted & 0x3ff); BLOGD(sc, DBG_INTR, "new mask 0x%08x\n", aeu_mask); REG_WR(sc, reg_addr, aeu_mask); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_PORT0_ATT_MASK + port); BLOGD(sc, DBG_INTR, "attn_state 0x%08x\n", sc->attn_state); sc->attn_state &= ~deasserted; BLOGD(sc, DBG_INTR, "new state 0x%08x\n", sc->attn_state); } static void bxe_attn_int(struct bxe_softc *sc) { /* read local copy of bits */ uint32_t attn_bits = le32toh(sc->def_sb->atten_status_block.attn_bits); uint32_t attn_ack = le32toh(sc->def_sb->atten_status_block.attn_bits_ack); uint32_t attn_state = sc->attn_state; /* look for changed bits */ uint32_t asserted = attn_bits & ~attn_ack & ~attn_state; uint32_t deasserted = ~attn_bits & attn_ack & attn_state; BLOGD(sc, DBG_INTR, "attn_bits 0x%08x attn_ack 0x%08x asserted 0x%08x deasserted 0x%08x\n", attn_bits, attn_ack, asserted, deasserted); if (~(attn_bits ^ attn_ack) & (attn_bits ^ attn_state)) { BLOGE(sc, "BAD attention state\n"); } /* handle bits that were raised */ if (asserted) { bxe_attn_int_asserted(sc, asserted); } if (deasserted) { bxe_attn_int_deasserted(sc, deasserted); } } static uint16_t bxe_update_dsb_idx(struct bxe_softc *sc) { struct host_sp_status_block *def_sb = sc->def_sb; uint16_t rc = 0; mb(); /* status block is written to by the chip */ if (sc->def_att_idx != def_sb->atten_status_block.attn_bits_index) { sc->def_att_idx = def_sb->atten_status_block.attn_bits_index; rc |= BXE_DEF_SB_ATT_IDX; } if (sc->def_idx != def_sb->sp_sb.running_index) { sc->def_idx = def_sb->sp_sb.running_index; rc |= BXE_DEF_SB_IDX; } mb(); return (rc); } static inline struct ecore_queue_sp_obj * bxe_cid_to_q_obj(struct bxe_softc *sc, uint32_t cid) { BLOGD(sc, DBG_SP, "retrieving fp from cid %d\n", cid); return (&sc->sp_objs[CID_TO_FP(cid, sc)].q_obj); } static void bxe_handle_mcast_eqe(struct bxe_softc *sc) { struct ecore_mcast_ramrod_params rparam; int rc; memset(&rparam, 0, sizeof(rparam)); rparam.mcast_obj = &sc->mcast_obj; BXE_MCAST_LOCK(sc); /* clear pending state for the last command */ sc->mcast_obj.raw.clear_pending(&sc->mcast_obj.raw); /* if there are pending mcast commands - send them */ if (sc->mcast_obj.check_pending(&sc->mcast_obj)) { rc = ecore_config_mcast(sc, &rparam, ECORE_MCAST_CMD_CONT); if (rc < 0) { BLOGD(sc, DBG_SP, "ERROR: Failed to send pending mcast commands (%d)\n", rc); } } BXE_MCAST_UNLOCK(sc); } static void bxe_handle_classification_eqe(struct bxe_softc *sc, union event_ring_elem *elem) { unsigned long ramrod_flags = 0; int rc = 0; uint32_t cid = elem->message.data.eth_event.echo & BXE_SWCID_MASK; struct ecore_vlan_mac_obj *vlan_mac_obj; /* always push next commands out, don't wait here */ bit_set(&ramrod_flags, RAMROD_CONT); switch (le32toh(elem->message.data.eth_event.echo) >> BXE_SWCID_SHIFT) { case ECORE_FILTER_MAC_PENDING: BLOGD(sc, DBG_SP, "Got SETUP_MAC completions\n"); vlan_mac_obj = &sc->sp_objs[cid].mac_obj; break; case ECORE_FILTER_MCAST_PENDING: BLOGD(sc, DBG_SP, "Got SETUP_MCAST completions\n"); /* * This is only relevant for 57710 where multicast MACs are * configured as unicast MACs using the same ramrod. */ bxe_handle_mcast_eqe(sc); return; default: BLOGE(sc, "Unsupported classification command: %d\n", elem->message.data.eth_event.echo); return; } rc = vlan_mac_obj->complete(sc, vlan_mac_obj, elem, &ramrod_flags); if (rc < 0) { BLOGE(sc, "Failed to schedule new commands (%d)\n", rc); } else if (rc > 0) { BLOGD(sc, DBG_SP, "Scheduled next pending commands...\n"); } } static void bxe_handle_rx_mode_eqe(struct bxe_softc *sc, union event_ring_elem *elem) { bxe_clear_bit(ECORE_FILTER_RX_MODE_PENDING, &sc->sp_state); /* send rx_mode command again if was requested */ if (bxe_test_and_clear_bit(ECORE_FILTER_RX_MODE_SCHED, &sc->sp_state)) { bxe_set_storm_rx_mode(sc); } #if 0 else if (bxe_test_and_clear_bit(ECORE_FILTER_ISCSI_ETH_START_SCHED, &sc->sp_state)) { bxe_set_iscsi_eth_rx_mode(sc, TRUE); } else if (bxe_test_and_clear_bit(ECORE_FILTER_ISCSI_ETH_STOP_SCHED, &sc->sp_state)) { bxe_set_iscsi_eth_rx_mode(sc, FALSE); } #endif } static void bxe_update_eq_prod(struct bxe_softc *sc, uint16_t prod) { storm_memset_eq_prod(sc, prod, SC_FUNC(sc)); wmb(); /* keep prod updates ordered */ } static void bxe_eq_int(struct bxe_softc *sc) { uint16_t hw_cons, sw_cons, sw_prod; union event_ring_elem *elem; uint8_t echo; uint32_t cid; uint8_t opcode; int spqe_cnt = 0; struct ecore_queue_sp_obj *q_obj; struct ecore_func_sp_obj *f_obj = &sc->func_obj; struct ecore_raw_obj *rss_raw = &sc->rss_conf_obj.raw; hw_cons = le16toh(*sc->eq_cons_sb); /* * The hw_cons range is 1-255, 257 - the sw_cons range is 0-254, 256. * when we get to the next-page we need to adjust so the loop * condition below will be met. The next element is the size of a * regular element and hence incrementing by 1 */ if ((hw_cons & EQ_DESC_MAX_PAGE) == EQ_DESC_MAX_PAGE) { hw_cons++; } /* * This function may never run in parallel with itself for a * specific sc and no need for a read memory barrier here. */ sw_cons = sc->eq_cons; sw_prod = sc->eq_prod; BLOGD(sc, DBG_SP,"EQ: hw_cons=%u sw_cons=%u eq_spq_left=0x%lx\n", hw_cons, sw_cons, atomic_load_acq_long(&sc->eq_spq_left)); for (; sw_cons != hw_cons; sw_prod = NEXT_EQ_IDX(sw_prod), sw_cons = NEXT_EQ_IDX(sw_cons)) { elem = &sc->eq[EQ_DESC(sw_cons)]; #if 0 int rc; rc = bxe_iov_eq_sp_event(sc, elem); if (!rc) { BLOGE(sc, "bxe_iov_eq_sp_event returned %d\n", rc); goto next_spqe; } #endif /* elem CID originates from FW, actually LE */ cid = SW_CID(elem->message.data.cfc_del_event.cid); opcode = elem->message.opcode; /* handle eq element */ switch (opcode) { #if 0 case EVENT_RING_OPCODE_VF_PF_CHANNEL: BLOGD(sc, DBG_SP, "vf/pf channel element on eq\n"); bxe_vf_mbx(sc, &elem->message.data.vf_pf_event); continue; #endif case EVENT_RING_OPCODE_STAT_QUERY: BLOGD(sc, DBG_SP, "got statistics completion event %d\n", sc->stats_comp++); /* nothing to do with stats comp */ goto next_spqe; case EVENT_RING_OPCODE_CFC_DEL: /* handle according to cid range */ /* we may want to verify here that the sc state is HALTING */ BLOGD(sc, DBG_SP, "got delete ramrod for MULTI[%d]\n", cid); q_obj = bxe_cid_to_q_obj(sc, cid); if (q_obj->complete_cmd(sc, q_obj, ECORE_Q_CMD_CFC_DEL)) { break; } goto next_spqe; case EVENT_RING_OPCODE_STOP_TRAFFIC: BLOGD(sc, DBG_SP, "got STOP TRAFFIC\n"); if (f_obj->complete_cmd(sc, f_obj, ECORE_F_CMD_TX_STOP)) { break; } // XXX bxe_dcbx_set_params(sc, BXE_DCBX_STATE_TX_PAUSED); goto next_spqe; case EVENT_RING_OPCODE_START_TRAFFIC: BLOGD(sc, DBG_SP, "got START TRAFFIC\n"); if (f_obj->complete_cmd(sc, f_obj, ECORE_F_CMD_TX_START)) { break; } // XXX bxe_dcbx_set_params(sc, BXE_DCBX_STATE_TX_RELEASED); goto next_spqe; case EVENT_RING_OPCODE_FUNCTION_UPDATE: echo = elem->message.data.function_update_event.echo; if (echo == SWITCH_UPDATE) { BLOGD(sc, DBG_SP, "got FUNC_SWITCH_UPDATE ramrod\n"); if (f_obj->complete_cmd(sc, f_obj, ECORE_F_CMD_SWITCH_UPDATE)) { break; } } else { BLOGD(sc, DBG_SP, "AFEX: ramrod completed FUNCTION_UPDATE\n"); #if 0 f_obj->complete_cmd(sc, f_obj, ECORE_F_CMD_AFEX_UPDATE); /* * We will perform the queues update from the sp_core_task as * all queue SP operations should run with CORE_LOCK. */ bxe_set_bit(BXE_SP_CORE_AFEX_F_UPDATE, &sc->sp_core_state); taskqueue_enqueue(sc->sp_tq, &sc->sp_tq_task); #endif } goto next_spqe; #if 0 case EVENT_RING_OPCODE_AFEX_VIF_LISTS: f_obj->complete_cmd(sc, f_obj, ECORE_F_CMD_AFEX_VIFLISTS); bxe_after_afex_vif_lists(sc, elem); goto next_spqe; #endif case EVENT_RING_OPCODE_FORWARD_SETUP: q_obj = &bxe_fwd_sp_obj(sc, q_obj); if (q_obj->complete_cmd(sc, q_obj, ECORE_Q_CMD_SETUP_TX_ONLY)) { break; } goto next_spqe; case EVENT_RING_OPCODE_FUNCTION_START: BLOGD(sc, DBG_SP, "got FUNC_START ramrod\n"); if (f_obj->complete_cmd(sc, f_obj, ECORE_F_CMD_START)) { break; } goto next_spqe; case EVENT_RING_OPCODE_FUNCTION_STOP: BLOGD(sc, DBG_SP, "got FUNC_STOP ramrod\n"); if (f_obj->complete_cmd(sc, f_obj, ECORE_F_CMD_STOP)) { break; } goto next_spqe; } switch (opcode | sc->state) { case (EVENT_RING_OPCODE_RSS_UPDATE_RULES | BXE_STATE_OPEN): case (EVENT_RING_OPCODE_RSS_UPDATE_RULES | BXE_STATE_OPENING_WAITING_PORT): cid = elem->message.data.eth_event.echo & BXE_SWCID_MASK; BLOGD(sc, DBG_SP, "got RSS_UPDATE ramrod. CID %d\n", cid); rss_raw->clear_pending(rss_raw); break; case (EVENT_RING_OPCODE_SET_MAC | BXE_STATE_OPEN): case (EVENT_RING_OPCODE_SET_MAC | BXE_STATE_DIAG): case (EVENT_RING_OPCODE_SET_MAC | BXE_STATE_CLOSING_WAITING_HALT): case (EVENT_RING_OPCODE_CLASSIFICATION_RULES | BXE_STATE_OPEN): case (EVENT_RING_OPCODE_CLASSIFICATION_RULES | BXE_STATE_DIAG): case (EVENT_RING_OPCODE_CLASSIFICATION_RULES | BXE_STATE_CLOSING_WAITING_HALT): BLOGD(sc, DBG_SP, "got (un)set mac ramrod\n"); bxe_handle_classification_eqe(sc, elem); break; case (EVENT_RING_OPCODE_MULTICAST_RULES | BXE_STATE_OPEN): case (EVENT_RING_OPCODE_MULTICAST_RULES | BXE_STATE_DIAG): case (EVENT_RING_OPCODE_MULTICAST_RULES | BXE_STATE_CLOSING_WAITING_HALT): BLOGD(sc, DBG_SP, "got mcast ramrod\n"); bxe_handle_mcast_eqe(sc); break; case (EVENT_RING_OPCODE_FILTERS_RULES | BXE_STATE_OPEN): case (EVENT_RING_OPCODE_FILTERS_RULES | BXE_STATE_DIAG): case (EVENT_RING_OPCODE_FILTERS_RULES | BXE_STATE_CLOSING_WAITING_HALT): BLOGD(sc, DBG_SP, "got rx_mode ramrod\n"); bxe_handle_rx_mode_eqe(sc, elem); break; default: /* unknown event log error and continue */ BLOGE(sc, "Unknown EQ event %d, sc->state 0x%x\n", elem->message.opcode, sc->state); } next_spqe: spqe_cnt++; } /* for */ mb(); atomic_add_acq_long(&sc->eq_spq_left, spqe_cnt); sc->eq_cons = sw_cons; sc->eq_prod = sw_prod; /* make sure that above mem writes were issued towards the memory */ wmb(); /* update producer */ bxe_update_eq_prod(sc, sc->eq_prod); } static void bxe_handle_sp_tq(void *context, int pending) { struct bxe_softc *sc = (struct bxe_softc *)context; uint16_t status; BLOGD(sc, DBG_SP, "---> SP TASK <---\n"); /* what work needs to be performed? */ status = bxe_update_dsb_idx(sc); BLOGD(sc, DBG_SP, "dsb status 0x%04x\n", status); /* HW attentions */ if (status & BXE_DEF_SB_ATT_IDX) { BLOGD(sc, DBG_SP, "---> ATTN INTR <---\n"); bxe_attn_int(sc); status &= ~BXE_DEF_SB_ATT_IDX; } /* SP events: STAT_QUERY and others */ if (status & BXE_DEF_SB_IDX) { /* handle EQ completions */ BLOGD(sc, DBG_SP, "---> EQ INTR <---\n"); bxe_eq_int(sc); bxe_ack_sb(sc, sc->igu_dsb_id, USTORM_ID, le16toh(sc->def_idx), IGU_INT_NOP, 1); status &= ~BXE_DEF_SB_IDX; } /* if status is non zero then something went wrong */ if (__predict_false(status)) { BLOGE(sc, "Got an unknown SP interrupt! (0x%04x)\n", status); } /* ack status block only if something was actually handled */ bxe_ack_sb(sc, sc->igu_dsb_id, ATTENTION_ID, le16toh(sc->def_att_idx), IGU_INT_ENABLE, 1); /* * Must be called after the EQ processing (since eq leads to sriov * ramrod completion flows). * This flow may have been scheduled by the arrival of a ramrod * completion, or by the sriov code rescheduling itself. */ // XXX bxe_iov_sp_task(sc); #if 0 /* AFEX - poll to check if VIFSET_ACK should be sent to MFW */ if (bxe_test_and_clear_bit(ECORE_AFEX_PENDING_VIFSET_MCP_ACK, &sc->sp_state)) { bxe_link_report(sc); bxe_fw_command(sc, DRV_MSG_CODE_AFEX_VIFSET_ACK, 0); } #endif } static void bxe_handle_fp_tq(void *context, int pending) { struct bxe_fastpath *fp = (struct bxe_fastpath *)context; struct bxe_softc *sc = fp->sc; uint8_t more_tx = FALSE; uint8_t more_rx = FALSE; BLOGD(sc, DBG_INTR, "---> FP TASK QUEUE (%d) <---\n", fp->index); /* XXX * IFF_DRV_RUNNING state can't be checked here since we process * slowpath events on a client queue during setup. Instead * we need to add a "process/continue" flag here that the driver * can use to tell the task here not to do anything. */ #if 0 if (!(if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING)) { return; } #endif /* update the fastpath index */ bxe_update_fp_sb_idx(fp); /* XXX add loop here if ever support multiple tx CoS */ /* fp->txdata[cos] */ if (bxe_has_tx_work(fp)) { BXE_FP_TX_LOCK(fp); more_tx = bxe_txeof(sc, fp); BXE_FP_TX_UNLOCK(fp); } if (bxe_has_rx_work(fp)) { more_rx = bxe_rxeof(sc, fp); } if (more_rx /*|| more_tx*/) { /* still more work to do */ taskqueue_enqueue_fast(fp->tq, &fp->tq_task); return; } bxe_ack_sb(sc, fp->igu_sb_id, USTORM_ID, le16toh(fp->fp_hc_idx), IGU_INT_ENABLE, 1); } static void bxe_task_fp(struct bxe_fastpath *fp) { struct bxe_softc *sc = fp->sc; uint8_t more_tx = FALSE; uint8_t more_rx = FALSE; BLOGD(sc, DBG_INTR, "---> FP TASK ISR (%d) <---\n", fp->index); /* update the fastpath index */ bxe_update_fp_sb_idx(fp); /* XXX add loop here if ever support multiple tx CoS */ /* fp->txdata[cos] */ if (bxe_has_tx_work(fp)) { BXE_FP_TX_LOCK(fp); more_tx = bxe_txeof(sc, fp); BXE_FP_TX_UNLOCK(fp); } if (bxe_has_rx_work(fp)) { more_rx = bxe_rxeof(sc, fp); } if (more_rx /*|| more_tx*/) { /* still more work to do, bail out if this ISR and process later */ taskqueue_enqueue_fast(fp->tq, &fp->tq_task); return; } /* * Here we write the fastpath index taken before doing any tx or rx work. * It is very well possible other hw events occurred up to this point and * they were actually processed accordingly above. Since we're going to * write an older fastpath index, an interrupt is coming which we might * not do any work in. */ bxe_ack_sb(sc, fp->igu_sb_id, USTORM_ID, le16toh(fp->fp_hc_idx), IGU_INT_ENABLE, 1); } /* * Legacy interrupt entry point. * * Verifies that the controller generated the interrupt and * then calls a separate routine to handle the various * interrupt causes: link, RX, and TX. */ static void bxe_intr_legacy(void *xsc) { struct bxe_softc *sc = (struct bxe_softc *)xsc; struct bxe_fastpath *fp; uint16_t status, mask; int i; BLOGD(sc, DBG_INTR, "---> BXE INTx <---\n"); #if 0 /* Don't handle any interrupts if we're not ready. */ if (__predict_false(sc->intr_sem != 0)) { return; } #endif /* * 0 for ustorm, 1 for cstorm * the bits returned from ack_int() are 0-15 * bit 0 = attention status block * bit 1 = fast path status block * a mask of 0x2 or more = tx/rx event * a mask of 1 = slow path event */ status = bxe_ack_int(sc); /* the interrupt is not for us */ if (__predict_false(status == 0)) { BLOGD(sc, DBG_INTR, "Not our interrupt!\n"); return; } BLOGD(sc, DBG_INTR, "Interrupt status 0x%04x\n", status); FOR_EACH_ETH_QUEUE(sc, i) { fp = &sc->fp[i]; mask = (0x2 << (fp->index + CNIC_SUPPORT(sc))); if (status & mask) { /* acknowledge and disable further fastpath interrupts */ bxe_ack_sb(sc, fp->igu_sb_id, USTORM_ID, 0, IGU_INT_DISABLE, 0); bxe_task_fp(fp); status &= ~mask; } } #if 0 if (CNIC_SUPPORT(sc)) { mask = 0x2; if (status & (mask | 0x1)) { ... status &= ~mask; } } #endif if (__predict_false(status & 0x1)) { /* acknowledge and disable further slowpath interrupts */ bxe_ack_sb(sc, sc->igu_dsb_id, USTORM_ID, 0, IGU_INT_DISABLE, 0); /* schedule slowpath handler */ taskqueue_enqueue_fast(sc->sp_tq, &sc->sp_tq_task); status &= ~0x1; } if (__predict_false(status)) { BLOGW(sc, "Unexpected fastpath status (0x%08x)!\n", status); } } /* slowpath interrupt entry point */ static void bxe_intr_sp(void *xsc) { struct bxe_softc *sc = (struct bxe_softc *)xsc; BLOGD(sc, (DBG_INTR | DBG_SP), "---> SP INTR <---\n"); /* acknowledge and disable further slowpath interrupts */ bxe_ack_sb(sc, sc->igu_dsb_id, USTORM_ID, 0, IGU_INT_DISABLE, 0); /* schedule slowpath handler */ taskqueue_enqueue_fast(sc->sp_tq, &sc->sp_tq_task); } /* fastpath interrupt entry point */ static void bxe_intr_fp(void *xfp) { struct bxe_fastpath *fp = (struct bxe_fastpath *)xfp; struct bxe_softc *sc = fp->sc; BLOGD(sc, DBG_INTR, "---> FP INTR %d <---\n", fp->index); BLOGD(sc, DBG_INTR, "(cpu=%d) MSI-X fp=%d fw_sb=%d igu_sb=%d\n", curcpu, fp->index, fp->fw_sb_id, fp->igu_sb_id); #if 0 /* Don't handle any interrupts if we're not ready. */ if (__predict_false(sc->intr_sem != 0)) { return; } #endif /* acknowledge and disable further fastpath interrupts */ bxe_ack_sb(sc, fp->igu_sb_id, USTORM_ID, 0, IGU_INT_DISABLE, 0); bxe_task_fp(fp); } /* Release all interrupts allocated by the driver. */ static void bxe_interrupt_free(struct bxe_softc *sc) { int i; switch (sc->interrupt_mode) { case INTR_MODE_INTX: BLOGD(sc, DBG_LOAD, "Releasing legacy INTx vector\n"); if (sc->intr[0].resource != NULL) { bus_release_resource(sc->dev, SYS_RES_IRQ, sc->intr[0].rid, sc->intr[0].resource); } break; case INTR_MODE_MSI: for (i = 0; i < sc->intr_count; i++) { BLOGD(sc, DBG_LOAD, "Releasing MSI vector %d\n", i); if (sc->intr[i].resource && sc->intr[i].rid) { bus_release_resource(sc->dev, SYS_RES_IRQ, sc->intr[i].rid, sc->intr[i].resource); } } pci_release_msi(sc->dev); break; case INTR_MODE_MSIX: for (i = 0; i < sc->intr_count; i++) { BLOGD(sc, DBG_LOAD, "Releasing MSI-X vector %d\n", i); if (sc->intr[i].resource && sc->intr[i].rid) { bus_release_resource(sc->dev, SYS_RES_IRQ, sc->intr[i].rid, sc->intr[i].resource); } } pci_release_msi(sc->dev); break; default: /* nothing to do as initial allocation failed */ break; } } /* * This function determines and allocates the appropriate * interrupt based on system capabilites and user request. * * The user may force a particular interrupt mode, specify * the number of receive queues, specify the method for * distribuitng received frames to receive queues, or use * the default settings which will automatically select the * best supported combination. In addition, the OS may or * may not support certain combinations of these settings. * This routine attempts to reconcile the settings requested * by the user with the capabilites available from the system * to select the optimal combination of features. * * Returns: * 0 = Success, !0 = Failure. */ static int bxe_interrupt_alloc(struct bxe_softc *sc) { int msix_count = 0; int msi_count = 0; int num_requested = 0; int num_allocated = 0; int rid, i, j; int rc; /* get the number of available MSI/MSI-X interrupts from the OS */ if (sc->interrupt_mode > 0) { if (sc->devinfo.pcie_cap_flags & BXE_MSIX_CAPABLE_FLAG) { msix_count = pci_msix_count(sc->dev); } if (sc->devinfo.pcie_cap_flags & BXE_MSI_CAPABLE_FLAG) { msi_count = pci_msi_count(sc->dev); } BLOGD(sc, DBG_LOAD, "%d MSI and %d MSI-X vectors available\n", msi_count, msix_count); } do { /* try allocating MSI-X interrupt resources (at least 2) */ if (sc->interrupt_mode != INTR_MODE_MSIX) { break; } if (((sc->devinfo.pcie_cap_flags & BXE_MSIX_CAPABLE_FLAG) == 0) || (msix_count < 2)) { sc->interrupt_mode = INTR_MODE_MSI; /* try MSI next */ break; } /* ask for the necessary number of MSI-X vectors */ num_requested = min((sc->num_queues + 1), msix_count); BLOGD(sc, DBG_LOAD, "Requesting %d MSI-X vectors\n", num_requested); num_allocated = num_requested; if ((rc = pci_alloc_msix(sc->dev, &num_allocated)) != 0) { BLOGE(sc, "MSI-X alloc failed! (%d)\n", rc); sc->interrupt_mode = INTR_MODE_MSI; /* try MSI next */ break; } if (num_allocated < 2) { /* possible? */ BLOGE(sc, "MSI-X allocation less than 2!\n"); sc->interrupt_mode = INTR_MODE_MSI; /* try MSI next */ pci_release_msi(sc->dev); break; } BLOGI(sc, "MSI-X vectors Requested %d and Allocated %d\n", num_requested, num_allocated); /* best effort so use the number of vectors allocated to us */ sc->intr_count = num_allocated; sc->num_queues = num_allocated - 1; rid = 1; /* initial resource identifier */ /* allocate the MSI-X vectors */ for (i = 0; i < num_allocated; i++) { sc->intr[i].rid = (rid + i); if ((sc->intr[i].resource = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &sc->intr[i].rid, RF_ACTIVE)) == NULL) { BLOGE(sc, "Failed to map MSI-X[%d] (rid=%d)!\n", i, (rid + i)); for (j = (i - 1); j >= 0; j--) { bus_release_resource(sc->dev, SYS_RES_IRQ, sc->intr[j].rid, sc->intr[j].resource); } sc->intr_count = 0; sc->num_queues = 0; sc->interrupt_mode = INTR_MODE_MSI; /* try MSI next */ pci_release_msi(sc->dev); break; } BLOGD(sc, DBG_LOAD, "Mapped MSI-X[%d] (rid=%d)\n", i, (rid + i)); } } while (0); do { /* try allocating MSI vector resources (at least 2) */ if (sc->interrupt_mode != INTR_MODE_MSI) { break; } if (((sc->devinfo.pcie_cap_flags & BXE_MSI_CAPABLE_FLAG) == 0) || (msi_count < 1)) { sc->interrupt_mode = INTR_MODE_INTX; /* try INTx next */ break; } /* ask for a single MSI vector */ num_requested = 1; BLOGD(sc, DBG_LOAD, "Requesting %d MSI vectors\n", num_requested); num_allocated = num_requested; if ((rc = pci_alloc_msi(sc->dev, &num_allocated)) != 0) { BLOGE(sc, "MSI alloc failed (%d)!\n", rc); sc->interrupt_mode = INTR_MODE_INTX; /* try INTx next */ break; } if (num_allocated != 1) { /* possible? */ BLOGE(sc, "MSI allocation is not 1!\n"); sc->interrupt_mode = INTR_MODE_INTX; /* try INTx next */ pci_release_msi(sc->dev); break; } BLOGI(sc, "MSI vectors Requested %d and Allocated %d\n", num_requested, num_allocated); /* best effort so use the number of vectors allocated to us */ sc->intr_count = num_allocated; sc->num_queues = num_allocated; rid = 1; /* initial resource identifier */ sc->intr[0].rid = rid; if ((sc->intr[0].resource = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &sc->intr[0].rid, RF_ACTIVE)) == NULL) { BLOGE(sc, "Failed to map MSI[0] (rid=%d)!\n", rid); sc->intr_count = 0; sc->num_queues = 0; sc->interrupt_mode = INTR_MODE_INTX; /* try INTx next */ pci_release_msi(sc->dev); break; } BLOGD(sc, DBG_LOAD, "Mapped MSI[0] (rid=%d)\n", rid); } while (0); do { /* try allocating INTx vector resources */ if (sc->interrupt_mode != INTR_MODE_INTX) { break; } BLOGD(sc, DBG_LOAD, "Requesting legacy INTx interrupt\n"); /* only one vector for INTx */ sc->intr_count = 1; sc->num_queues = 1; rid = 0; /* initial resource identifier */ sc->intr[0].rid = rid; if ((sc->intr[0].resource = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &sc->intr[0].rid, (RF_ACTIVE | RF_SHAREABLE))) == NULL) { BLOGE(sc, "Failed to map INTx (rid=%d)!\n", rid); sc->intr_count = 0; sc->num_queues = 0; sc->interrupt_mode = -1; /* Failed! */ break; } BLOGD(sc, DBG_LOAD, "Mapped INTx (rid=%d)\n", rid); } while (0); if (sc->interrupt_mode == -1) { BLOGE(sc, "Interrupt Allocation: FAILED!!!\n"); rc = 1; } else { BLOGD(sc, DBG_LOAD, "Interrupt Allocation: interrupt_mode=%d, num_queues=%d\n", sc->interrupt_mode, sc->num_queues); rc = 0; } return (rc); } static void bxe_interrupt_detach(struct bxe_softc *sc) { struct bxe_fastpath *fp; int i; /* release interrupt resources */ for (i = 0; i < sc->intr_count; i++) { if (sc->intr[i].resource && sc->intr[i].tag) { BLOGD(sc, DBG_LOAD, "Disabling interrupt vector %d\n", i); bus_teardown_intr(sc->dev, sc->intr[i].resource, sc->intr[i].tag); } } for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; if (fp->tq) { taskqueue_drain(fp->tq, &fp->tq_task); taskqueue_free(fp->tq); fp->tq = NULL; } } if (sc->rx_mode_tq) { taskqueue_drain(sc->rx_mode_tq, &sc->rx_mode_tq_task); taskqueue_free(sc->rx_mode_tq); sc->rx_mode_tq = NULL; } if (sc->sp_tq) { taskqueue_drain(sc->sp_tq, &sc->sp_tq_task); taskqueue_free(sc->sp_tq); sc->sp_tq = NULL; } } /* * Enables interrupts and attach to the ISR. * * When using multiple MSI/MSI-X vectors the first vector * is used for slowpath operations while all remaining * vectors are used for fastpath operations. If only a * single MSI/MSI-X vector is used (SINGLE_ISR) then the * ISR must look for both slowpath and fastpath completions. */ static int bxe_interrupt_attach(struct bxe_softc *sc) { struct bxe_fastpath *fp; int rc = 0; int i; snprintf(sc->sp_tq_name, sizeof(sc->sp_tq_name), "bxe%d_sp_tq", sc->unit); TASK_INIT(&sc->sp_tq_task, 0, bxe_handle_sp_tq, sc); sc->sp_tq = taskqueue_create_fast(sc->sp_tq_name, M_NOWAIT, taskqueue_thread_enqueue, &sc->sp_tq); taskqueue_start_threads(&sc->sp_tq, 1, PWAIT, /* lower priority */ "%s", sc->sp_tq_name); snprintf(sc->rx_mode_tq_name, sizeof(sc->rx_mode_tq_name), "bxe%d_rx_mode_tq", sc->unit); TASK_INIT(&sc->rx_mode_tq_task, 0, bxe_handle_rx_mode_tq, sc); sc->rx_mode_tq = taskqueue_create_fast(sc->rx_mode_tq_name, M_NOWAIT, taskqueue_thread_enqueue, &sc->rx_mode_tq); taskqueue_start_threads(&sc->rx_mode_tq, 1, PWAIT, /* lower priority */ "%s", sc->rx_mode_tq_name); for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; snprintf(fp->tq_name, sizeof(fp->tq_name), "bxe%d_fp%d_tq", sc->unit, i); TASK_INIT(&fp->tq_task, 0, bxe_handle_fp_tq, fp); fp->tq = taskqueue_create_fast(fp->tq_name, M_NOWAIT, taskqueue_thread_enqueue, &fp->tq); taskqueue_start_threads(&fp->tq, 1, PI_NET, /* higher priority */ "%s", fp->tq_name); } /* setup interrupt handlers */ if (sc->interrupt_mode == INTR_MODE_MSIX) { BLOGD(sc, DBG_LOAD, "Enabling slowpath MSI-X[0] vector\n"); /* * Setup the interrupt handler. Note that we pass the driver instance * to the interrupt handler for the slowpath. */ if ((rc = bus_setup_intr(sc->dev, sc->intr[0].resource, (INTR_TYPE_NET | INTR_MPSAFE), NULL, bxe_intr_sp, sc, &sc->intr[0].tag)) != 0) { BLOGE(sc, "Failed to allocate MSI-X[0] vector (%d)\n", rc); goto bxe_interrupt_attach_exit; } bus_describe_intr(sc->dev, sc->intr[0].resource, sc->intr[0].tag, "sp"); /* bus_bind_intr(sc->dev, sc->intr[0].resource, 0); */ /* initialize the fastpath vectors (note the first was used for sp) */ for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; BLOGD(sc, DBG_LOAD, "Enabling MSI-X[%d] vector\n", (i + 1)); /* * Setup the interrupt handler. Note that we pass the * fastpath context to the interrupt handler in this * case. */ if ((rc = bus_setup_intr(sc->dev, sc->intr[i + 1].resource, (INTR_TYPE_NET | INTR_MPSAFE), NULL, bxe_intr_fp, fp, &sc->intr[i + 1].tag)) != 0) { BLOGE(sc, "Failed to allocate MSI-X[%d] vector (%d)\n", (i + 1), rc); goto bxe_interrupt_attach_exit; } bus_describe_intr(sc->dev, sc->intr[i + 1].resource, sc->intr[i + 1].tag, "fp%02d", i); /* bind the fastpath instance to a cpu */ if (sc->num_queues > 1) { bus_bind_intr(sc->dev, sc->intr[i + 1].resource, i); } fp->state = BXE_FP_STATE_IRQ; } } else if (sc->interrupt_mode == INTR_MODE_MSI) { BLOGD(sc, DBG_LOAD, "Enabling MSI[0] vector\n"); /* * Setup the interrupt handler. Note that we pass the * driver instance to the interrupt handler which * will handle both the slowpath and fastpath. */ if ((rc = bus_setup_intr(sc->dev, sc->intr[0].resource, (INTR_TYPE_NET | INTR_MPSAFE), NULL, bxe_intr_legacy, sc, &sc->intr[0].tag)) != 0) { BLOGE(sc, "Failed to allocate MSI[0] vector (%d)\n", rc); goto bxe_interrupt_attach_exit; } } else { /* (sc->interrupt_mode == INTR_MODE_INTX) */ BLOGD(sc, DBG_LOAD, "Enabling INTx interrupts\n"); /* * Setup the interrupt handler. Note that we pass the * driver instance to the interrupt handler which * will handle both the slowpath and fastpath. */ if ((rc = bus_setup_intr(sc->dev, sc->intr[0].resource, (INTR_TYPE_NET | INTR_MPSAFE), NULL, bxe_intr_legacy, sc, &sc->intr[0].tag)) != 0) { BLOGE(sc, "Failed to allocate INTx interrupt (%d)\n", rc); goto bxe_interrupt_attach_exit; } } bxe_interrupt_attach_exit: return (rc); } static int bxe_init_hw_common_chip(struct bxe_softc *sc); static int bxe_init_hw_common(struct bxe_softc *sc); static int bxe_init_hw_port(struct bxe_softc *sc); static int bxe_init_hw_func(struct bxe_softc *sc); static void bxe_reset_common(struct bxe_softc *sc); static void bxe_reset_port(struct bxe_softc *sc); static void bxe_reset_func(struct bxe_softc *sc); static int bxe_gunzip_init(struct bxe_softc *sc); static void bxe_gunzip_end(struct bxe_softc *sc); static int bxe_init_firmware(struct bxe_softc *sc); static void bxe_release_firmware(struct bxe_softc *sc); static struct ecore_func_sp_drv_ops bxe_func_sp_drv = { .init_hw_cmn_chip = bxe_init_hw_common_chip, .init_hw_cmn = bxe_init_hw_common, .init_hw_port = bxe_init_hw_port, .init_hw_func = bxe_init_hw_func, .reset_hw_cmn = bxe_reset_common, .reset_hw_port = bxe_reset_port, .reset_hw_func = bxe_reset_func, .gunzip_init = bxe_gunzip_init, .gunzip_end = bxe_gunzip_end, .init_fw = bxe_init_firmware, .release_fw = bxe_release_firmware, }; static void bxe_init_func_obj(struct bxe_softc *sc) { sc->dmae_ready = 0; ecore_init_func_obj(sc, &sc->func_obj, BXE_SP(sc, func_rdata), BXE_SP_MAPPING(sc, func_rdata), BXE_SP(sc, func_afex_rdata), BXE_SP_MAPPING(sc, func_afex_rdata), &bxe_func_sp_drv); } static int bxe_init_hw(struct bxe_softc *sc, uint32_t load_code) { struct ecore_func_state_params func_params = { NULL }; int rc; /* prepare the parameters for function state transitions */ bit_set(&func_params.ramrod_flags, RAMROD_COMP_WAIT); func_params.f_obj = &sc->func_obj; func_params.cmd = ECORE_F_CMD_HW_INIT; func_params.params.hw_init.load_phase = load_code; /* * Via a plethora of function pointers, we will eventually reach * bxe_init_hw_common(), bxe_init_hw_port(), or bxe_init_hw_func(). */ rc = ecore_func_state_change(sc, &func_params); return (rc); } static void bxe_fill(struct bxe_softc *sc, uint32_t addr, int fill, uint32_t len) { uint32_t i; if (!(len % 4) && !(addr % 4)) { for (i = 0; i < len; i += 4) { REG_WR(sc, (addr + i), fill); } } else { for (i = 0; i < len; i++) { REG_WR8(sc, (addr + i), fill); } } } /* writes FP SP data to FW - data_size in dwords */ static void bxe_wr_fp_sb_data(struct bxe_softc *sc, int fw_sb_id, uint32_t *sb_data_p, uint32_t data_size) { int index; for (index = 0; index < data_size; index++) { REG_WR(sc, (BAR_CSTRORM_INTMEM + CSTORM_STATUS_BLOCK_DATA_OFFSET(fw_sb_id) + (sizeof(uint32_t) * index)), *(sb_data_p + index)); } } static void bxe_zero_fp_sb(struct bxe_softc *sc, int fw_sb_id) { struct hc_status_block_data_e2 sb_data_e2; struct hc_status_block_data_e1x sb_data_e1x; uint32_t *sb_data_p; uint32_t data_size = 0; if (!CHIP_IS_E1x(sc)) { memset(&sb_data_e2, 0, sizeof(struct hc_status_block_data_e2)); sb_data_e2.common.state = SB_DISABLED; sb_data_e2.common.p_func.vf_valid = FALSE; sb_data_p = (uint32_t *)&sb_data_e2; data_size = (sizeof(struct hc_status_block_data_e2) / sizeof(uint32_t)); } else { memset(&sb_data_e1x, 0, sizeof(struct hc_status_block_data_e1x)); sb_data_e1x.common.state = SB_DISABLED; sb_data_e1x.common.p_func.vf_valid = FALSE; sb_data_p = (uint32_t *)&sb_data_e1x; data_size = (sizeof(struct hc_status_block_data_e1x) / sizeof(uint32_t)); } bxe_wr_fp_sb_data(sc, fw_sb_id, sb_data_p, data_size); bxe_fill(sc, (BAR_CSTRORM_INTMEM + CSTORM_STATUS_BLOCK_OFFSET(fw_sb_id)), 0, CSTORM_STATUS_BLOCK_SIZE); bxe_fill(sc, (BAR_CSTRORM_INTMEM + CSTORM_SYNC_BLOCK_OFFSET(fw_sb_id)), 0, CSTORM_SYNC_BLOCK_SIZE); } static void bxe_wr_sp_sb_data(struct bxe_softc *sc, struct hc_sp_status_block_data *sp_sb_data) { int i; for (i = 0; i < (sizeof(struct hc_sp_status_block_data) / sizeof(uint32_t)); i++) { REG_WR(sc, (BAR_CSTRORM_INTMEM + CSTORM_SP_STATUS_BLOCK_DATA_OFFSET(SC_FUNC(sc)) + (i * sizeof(uint32_t))), *((uint32_t *)sp_sb_data + i)); } } static void bxe_zero_sp_sb(struct bxe_softc *sc) { struct hc_sp_status_block_data sp_sb_data; memset(&sp_sb_data, 0, sizeof(struct hc_sp_status_block_data)); sp_sb_data.state = SB_DISABLED; sp_sb_data.p_func.vf_valid = FALSE; bxe_wr_sp_sb_data(sc, &sp_sb_data); bxe_fill(sc, (BAR_CSTRORM_INTMEM + CSTORM_SP_STATUS_BLOCK_OFFSET(SC_FUNC(sc))), 0, CSTORM_SP_STATUS_BLOCK_SIZE); bxe_fill(sc, (BAR_CSTRORM_INTMEM + CSTORM_SP_SYNC_BLOCK_OFFSET(SC_FUNC(sc))), 0, CSTORM_SP_SYNC_BLOCK_SIZE); } static void bxe_setup_ndsb_state_machine(struct hc_status_block_sm *hc_sm, int igu_sb_id, int igu_seg_id) { hc_sm->igu_sb_id = igu_sb_id; hc_sm->igu_seg_id = igu_seg_id; hc_sm->timer_value = 0xFF; hc_sm->time_to_expire = 0xFFFFFFFF; } static void bxe_map_sb_state_machines(struct hc_index_data *index_data) { /* zero out state machine indices */ /* rx indices */ index_data[HC_INDEX_ETH_RX_CQ_CONS].flags &= ~HC_INDEX_DATA_SM_ID; /* tx indices */ index_data[HC_INDEX_OOO_TX_CQ_CONS].flags &= ~HC_INDEX_DATA_SM_ID; index_data[HC_INDEX_ETH_TX_CQ_CONS_COS0].flags &= ~HC_INDEX_DATA_SM_ID; index_data[HC_INDEX_ETH_TX_CQ_CONS_COS1].flags &= ~HC_INDEX_DATA_SM_ID; index_data[HC_INDEX_ETH_TX_CQ_CONS_COS2].flags &= ~HC_INDEX_DATA_SM_ID; /* map indices */ /* rx indices */ index_data[HC_INDEX_ETH_RX_CQ_CONS].flags |= (SM_RX_ID << HC_INDEX_DATA_SM_ID_SHIFT); /* tx indices */ index_data[HC_INDEX_OOO_TX_CQ_CONS].flags |= (SM_TX_ID << HC_INDEX_DATA_SM_ID_SHIFT); index_data[HC_INDEX_ETH_TX_CQ_CONS_COS0].flags |= (SM_TX_ID << HC_INDEX_DATA_SM_ID_SHIFT); index_data[HC_INDEX_ETH_TX_CQ_CONS_COS1].flags |= (SM_TX_ID << HC_INDEX_DATA_SM_ID_SHIFT); index_data[HC_INDEX_ETH_TX_CQ_CONS_COS2].flags |= (SM_TX_ID << HC_INDEX_DATA_SM_ID_SHIFT); } static void bxe_init_sb(struct bxe_softc *sc, bus_addr_t busaddr, int vfid, uint8_t vf_valid, int fw_sb_id, int igu_sb_id) { struct hc_status_block_data_e2 sb_data_e2; struct hc_status_block_data_e1x sb_data_e1x; struct hc_status_block_sm *hc_sm_p; uint32_t *sb_data_p; int igu_seg_id; int data_size; if (CHIP_INT_MODE_IS_BC(sc)) { igu_seg_id = HC_SEG_ACCESS_NORM; } else { igu_seg_id = IGU_SEG_ACCESS_NORM; } bxe_zero_fp_sb(sc, fw_sb_id); if (!CHIP_IS_E1x(sc)) { memset(&sb_data_e2, 0, sizeof(struct hc_status_block_data_e2)); sb_data_e2.common.state = SB_ENABLED; sb_data_e2.common.p_func.pf_id = SC_FUNC(sc); sb_data_e2.common.p_func.vf_id = vfid; sb_data_e2.common.p_func.vf_valid = vf_valid; sb_data_e2.common.p_func.vnic_id = SC_VN(sc); sb_data_e2.common.same_igu_sb_1b = TRUE; sb_data_e2.common.host_sb_addr.hi = U64_HI(busaddr); sb_data_e2.common.host_sb_addr.lo = U64_LO(busaddr); hc_sm_p = sb_data_e2.common.state_machine; sb_data_p = (uint32_t *)&sb_data_e2; data_size = (sizeof(struct hc_status_block_data_e2) / sizeof(uint32_t)); bxe_map_sb_state_machines(sb_data_e2.index_data); } else { memset(&sb_data_e1x, 0, sizeof(struct hc_status_block_data_e1x)); sb_data_e1x.common.state = SB_ENABLED; sb_data_e1x.common.p_func.pf_id = SC_FUNC(sc); sb_data_e1x.common.p_func.vf_id = 0xff; sb_data_e1x.common.p_func.vf_valid = FALSE; sb_data_e1x.common.p_func.vnic_id = SC_VN(sc); sb_data_e1x.common.same_igu_sb_1b = TRUE; sb_data_e1x.common.host_sb_addr.hi = U64_HI(busaddr); sb_data_e1x.common.host_sb_addr.lo = U64_LO(busaddr); hc_sm_p = sb_data_e1x.common.state_machine; sb_data_p = (uint32_t *)&sb_data_e1x; data_size = (sizeof(struct hc_status_block_data_e1x) / sizeof(uint32_t)); bxe_map_sb_state_machines(sb_data_e1x.index_data); } bxe_setup_ndsb_state_machine(&hc_sm_p[SM_RX_ID], igu_sb_id, igu_seg_id); bxe_setup_ndsb_state_machine(&hc_sm_p[SM_TX_ID], igu_sb_id, igu_seg_id); BLOGD(sc, DBG_LOAD, "Init FW SB %d\n", fw_sb_id); /* write indices to HW - PCI guarantees endianity of regpairs */ bxe_wr_fp_sb_data(sc, fw_sb_id, sb_data_p, data_size); } static inline uint8_t bxe_fp_qzone_id(struct bxe_fastpath *fp) { if (CHIP_IS_E1x(fp->sc)) { return (fp->cl_id + SC_PORT(fp->sc) * ETH_MAX_RX_CLIENTS_E1H); } else { return (fp->cl_id); } } static inline uint32_t bxe_rx_ustorm_prods_offset(struct bxe_softc *sc, struct bxe_fastpath *fp) { uint32_t offset = BAR_USTRORM_INTMEM; #if 0 if (IS_VF(sc)) { return (PXP_VF_ADDR_USDM_QUEUES_START + (sc->acquire_resp.resc.hw_qid[fp->index] * sizeof(struct ustorm_queue_zone_data))); } else #endif if (!CHIP_IS_E1x(sc)) { offset += USTORM_RX_PRODS_E2_OFFSET(fp->cl_qzone_id); } else { offset += USTORM_RX_PRODS_E1X_OFFSET(SC_PORT(sc), fp->cl_id); } return (offset); } static void bxe_init_eth_fp(struct bxe_softc *sc, int idx) { struct bxe_fastpath *fp = &sc->fp[idx]; uint32_t cids[ECORE_MULTI_TX_COS] = { 0 }; unsigned long q_type = 0; int cos; fp->sc = sc; fp->index = idx; snprintf(fp->tx_mtx_name, sizeof(fp->tx_mtx_name), "bxe%d_fp%d_tx_lock", sc->unit, idx); mtx_init(&fp->tx_mtx, fp->tx_mtx_name, NULL, MTX_DEF); snprintf(fp->rx_mtx_name, sizeof(fp->rx_mtx_name), "bxe%d_fp%d_rx_lock", sc->unit, idx); mtx_init(&fp->rx_mtx, fp->rx_mtx_name, NULL, MTX_DEF); fp->igu_sb_id = (sc->igu_base_sb + idx + CNIC_SUPPORT(sc)); fp->fw_sb_id = (sc->base_fw_ndsb + idx + CNIC_SUPPORT(sc)); fp->cl_id = (CHIP_IS_E1x(sc)) ? (SC_L_ID(sc) + idx) : /* want client ID same as IGU SB ID for non-E1 */ fp->igu_sb_id; fp->cl_qzone_id = bxe_fp_qzone_id(fp); /* setup sb indices */ if (!CHIP_IS_E1x(sc)) { fp->sb_index_values = fp->status_block.e2_sb->sb.index_values; fp->sb_running_index = fp->status_block.e2_sb->sb.running_index; } else { fp->sb_index_values = fp->status_block.e1x_sb->sb.index_values; fp->sb_running_index = fp->status_block.e1x_sb->sb.running_index; } /* init shortcut */ fp->ustorm_rx_prods_offset = bxe_rx_ustorm_prods_offset(sc, fp); fp->rx_cq_cons_sb = &fp->sb_index_values[HC_INDEX_ETH_RX_CQ_CONS]; /* * XXX If multiple CoS is ever supported then each fastpath structure * will need to maintain tx producer/consumer/dma/etc values *per* CoS. */ for (cos = 0; cos < sc->max_cos; cos++) { cids[cos] = idx; } fp->tx_cons_sb = &fp->sb_index_values[HC_INDEX_ETH_TX_CQ_CONS_COS0]; /* nothing more for a VF to do */ if (IS_VF(sc)) { return; } bxe_init_sb(sc, fp->sb_dma.paddr, BXE_VF_ID_INVALID, FALSE, fp->fw_sb_id, fp->igu_sb_id); bxe_update_fp_sb_idx(fp); /* Configure Queue State object */ bit_set(&q_type, ECORE_Q_TYPE_HAS_RX); bit_set(&q_type, ECORE_Q_TYPE_HAS_TX); ecore_init_queue_obj(sc, &sc->sp_objs[idx].q_obj, fp->cl_id, cids, sc->max_cos, SC_FUNC(sc), BXE_SP(sc, q_rdata), BXE_SP_MAPPING(sc, q_rdata), q_type); /* configure classification DBs */ ecore_init_mac_obj(sc, &sc->sp_objs[idx].mac_obj, fp->cl_id, idx, SC_FUNC(sc), BXE_SP(sc, mac_rdata), BXE_SP_MAPPING(sc, mac_rdata), ECORE_FILTER_MAC_PENDING, &sc->sp_state, ECORE_OBJ_TYPE_RX_TX, &sc->macs_pool); BLOGD(sc, DBG_LOAD, "fp[%d]: sb=%p cl_id=%d fw_sb=%d igu_sb=%d\n", idx, fp->status_block.e2_sb, fp->cl_id, fp->fw_sb_id, fp->igu_sb_id); } static inline void bxe_update_rx_prod(struct bxe_softc *sc, struct bxe_fastpath *fp, uint16_t rx_bd_prod, uint16_t rx_cq_prod, uint16_t rx_sge_prod) { struct ustorm_eth_rx_producers rx_prods = { 0 }; uint32_t i; /* update producers */ rx_prods.bd_prod = rx_bd_prod; rx_prods.cqe_prod = rx_cq_prod; rx_prods.sge_prod = rx_sge_prod; /* * Make sure that the BD and SGE data is updated before updating the * producers since FW might read the BD/SGE right after the producer * is updated. * This is only applicable for weak-ordered memory model archs such * as IA-64. The following barrier is also mandatory since FW will * assumes BDs must have buffers. */ wmb(); for (i = 0; i < (sizeof(rx_prods) / 4); i++) { REG_WR(sc, (fp->ustorm_rx_prods_offset + (i * 4)), ((uint32_t *)&rx_prods)[i]); } wmb(); /* keep prod updates ordered */ BLOGD(sc, DBG_RX, "RX fp[%d]: wrote prods bd_prod=%u cqe_prod=%u sge_prod=%u\n", fp->index, rx_bd_prod, rx_cq_prod, rx_sge_prod); } static void bxe_init_rx_rings(struct bxe_softc *sc) { struct bxe_fastpath *fp; int i; for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; fp->rx_bd_cons = 0; /* * Activate the BD ring... * Warning, this will generate an interrupt (to the TSTORM) * so this can only be done after the chip is initialized */ bxe_update_rx_prod(sc, fp, fp->rx_bd_prod, fp->rx_cq_prod, fp->rx_sge_prod); if (i != 0) { continue; } if (CHIP_IS_E1(sc)) { REG_WR(sc, (BAR_USTRORM_INTMEM + USTORM_MEM_WORKAROUND_ADDRESS_OFFSET(SC_FUNC(sc))), U64_LO(fp->rcq_dma.paddr)); REG_WR(sc, (BAR_USTRORM_INTMEM + USTORM_MEM_WORKAROUND_ADDRESS_OFFSET(SC_FUNC(sc)) + 4), U64_HI(fp->rcq_dma.paddr)); } } } static void bxe_init_tx_ring_one(struct bxe_fastpath *fp) { SET_FLAG(fp->tx_db.data.header.header, DOORBELL_HDR_DB_TYPE, 1); fp->tx_db.data.zero_fill1 = 0; fp->tx_db.data.prod = 0; fp->tx_pkt_prod = 0; fp->tx_pkt_cons = 0; fp->tx_bd_prod = 0; fp->tx_bd_cons = 0; fp->eth_q_stats.tx_pkts = 0; } static inline void bxe_init_tx_rings(struct bxe_softc *sc) { int i; for (i = 0; i < sc->num_queues; i++) { #if 0 uint8_t cos; for (cos = 0; cos < sc->max_cos; cos++) { bxe_init_tx_ring_one(&sc->fp[i].txdata[cos]); } #else bxe_init_tx_ring_one(&sc->fp[i]); #endif } } static void bxe_init_def_sb(struct bxe_softc *sc) { struct host_sp_status_block *def_sb = sc->def_sb; bus_addr_t mapping = sc->def_sb_dma.paddr; int igu_sp_sb_index; int igu_seg_id; int port = SC_PORT(sc); int func = SC_FUNC(sc); int reg_offset, reg_offset_en5; uint64_t section; int index, sindex; struct hc_sp_status_block_data sp_sb_data; memset(&sp_sb_data, 0, sizeof(struct hc_sp_status_block_data)); if (CHIP_INT_MODE_IS_BC(sc)) { igu_sp_sb_index = DEF_SB_IGU_ID; igu_seg_id = HC_SEG_ACCESS_DEF; } else { igu_sp_sb_index = sc->igu_dsb_id; igu_seg_id = IGU_SEG_ACCESS_DEF; } /* attentions */ section = ((uint64_t)mapping + offsetof(struct host_sp_status_block, atten_status_block)); def_sb->atten_status_block.status_block_id = igu_sp_sb_index; sc->attn_state = 0; reg_offset = (port) ? MISC_REG_AEU_ENABLE1_FUNC_1_OUT_0 : MISC_REG_AEU_ENABLE1_FUNC_0_OUT_0; reg_offset_en5 = (port) ? MISC_REG_AEU_ENABLE5_FUNC_1_OUT_0 : MISC_REG_AEU_ENABLE5_FUNC_0_OUT_0; for (index = 0; index < MAX_DYNAMIC_ATTN_GRPS; index++) { /* take care of sig[0]..sig[4] */ for (sindex = 0; sindex < 4; sindex++) { sc->attn_group[index].sig[sindex] = REG_RD(sc, (reg_offset + (sindex * 0x4) + (0x10 * index))); } if (!CHIP_IS_E1x(sc)) { /* * enable5 is separate from the rest of the registers, * and the address skip is 4 and not 16 between the * different groups */ sc->attn_group[index].sig[4] = REG_RD(sc, (reg_offset_en5 + (0x4 * index))); } else { sc->attn_group[index].sig[4] = 0; } } if (sc->devinfo.int_block == INT_BLOCK_HC) { reg_offset = (port) ? HC_REG_ATTN_MSG1_ADDR_L : HC_REG_ATTN_MSG0_ADDR_L; REG_WR(sc, reg_offset, U64_LO(section)); REG_WR(sc, (reg_offset + 4), U64_HI(section)); } else if (!CHIP_IS_E1x(sc)) { REG_WR(sc, IGU_REG_ATTN_MSG_ADDR_L, U64_LO(section)); REG_WR(sc, IGU_REG_ATTN_MSG_ADDR_H, U64_HI(section)); } section = ((uint64_t)mapping + offsetof(struct host_sp_status_block, sp_sb)); bxe_zero_sp_sb(sc); /* PCI guarantees endianity of regpair */ sp_sb_data.state = SB_ENABLED; sp_sb_data.host_sb_addr.lo = U64_LO(section); sp_sb_data.host_sb_addr.hi = U64_HI(section); sp_sb_data.igu_sb_id = igu_sp_sb_index; sp_sb_data.igu_seg_id = igu_seg_id; sp_sb_data.p_func.pf_id = func; sp_sb_data.p_func.vnic_id = SC_VN(sc); sp_sb_data.p_func.vf_id = 0xff; bxe_wr_sp_sb_data(sc, &sp_sb_data); bxe_ack_sb(sc, sc->igu_dsb_id, USTORM_ID, 0, IGU_INT_ENABLE, 0); } static void bxe_init_sp_ring(struct bxe_softc *sc) { atomic_store_rel_long(&sc->cq_spq_left, MAX_SPQ_PENDING); sc->spq_prod_idx = 0; sc->dsb_sp_prod = &sc->def_sb->sp_sb.index_values[HC_SP_INDEX_ETH_DEF_CONS]; sc->spq_prod_bd = sc->spq; sc->spq_last_bd = (sc->spq_prod_bd + MAX_SP_DESC_CNT); } static void bxe_init_eq_ring(struct bxe_softc *sc) { union event_ring_elem *elem; int i; for (i = 1; i <= NUM_EQ_PAGES; i++) { elem = &sc->eq[EQ_DESC_CNT_PAGE * i - 1]; elem->next_page.addr.hi = htole32(U64_HI(sc->eq_dma.paddr + BCM_PAGE_SIZE * (i % NUM_EQ_PAGES))); elem->next_page.addr.lo = htole32(U64_LO(sc->eq_dma.paddr + BCM_PAGE_SIZE * (i % NUM_EQ_PAGES))); } sc->eq_cons = 0; sc->eq_prod = NUM_EQ_DESC; sc->eq_cons_sb = &sc->def_sb->sp_sb.index_values[HC_SP_INDEX_EQ_CONS]; atomic_store_rel_long(&sc->eq_spq_left, (min((MAX_SP_DESC_CNT - MAX_SPQ_PENDING), NUM_EQ_DESC) - 1)); } static void bxe_init_internal_common(struct bxe_softc *sc) { int i; if (IS_MF_SI(sc)) { /* * In switch independent mode, the TSTORM needs to accept * packets that failed classification, since approximate match * mac addresses aren't written to NIG LLH. */ REG_WR8(sc, (BAR_TSTRORM_INTMEM + TSTORM_ACCEPT_CLASSIFY_FAILED_OFFSET), 2); } else if (!CHIP_IS_E1(sc)) { /* 57710 doesn't support MF */ REG_WR8(sc, (BAR_TSTRORM_INTMEM + TSTORM_ACCEPT_CLASSIFY_FAILED_OFFSET), 0); } /* * Zero this manually as its initialization is currently missing * in the initTool. */ for (i = 0; i < (USTORM_AGG_DATA_SIZE >> 2); i++) { REG_WR(sc, (BAR_USTRORM_INTMEM + USTORM_AGG_DATA_OFFSET + (i * 4)), 0); } if (!CHIP_IS_E1x(sc)) { REG_WR8(sc, (BAR_CSTRORM_INTMEM + CSTORM_IGU_MODE_OFFSET), CHIP_INT_MODE_IS_BC(sc) ? HC_IGU_BC_MODE : HC_IGU_NBC_MODE); } } static void bxe_init_internal(struct bxe_softc *sc, uint32_t load_code) { switch (load_code) { case FW_MSG_CODE_DRV_LOAD_COMMON: case FW_MSG_CODE_DRV_LOAD_COMMON_CHIP: bxe_init_internal_common(sc); /* no break */ case FW_MSG_CODE_DRV_LOAD_PORT: /* nothing to do */ /* no break */ case FW_MSG_CODE_DRV_LOAD_FUNCTION: /* internal memory per function is initialized inside bxe_pf_init */ break; default: BLOGE(sc, "Unknown load_code (0x%x) from MCP\n", load_code); break; } } static void storm_memset_func_cfg(struct bxe_softc *sc, struct tstorm_eth_function_common_config *tcfg, uint16_t abs_fid) { uint32_t addr; size_t size; addr = (BAR_TSTRORM_INTMEM + TSTORM_FUNCTION_COMMON_CONFIG_OFFSET(abs_fid)); size = sizeof(struct tstorm_eth_function_common_config); ecore_storm_memset_struct(sc, addr, size, (uint32_t *)tcfg); } static void bxe_func_init(struct bxe_softc *sc, struct bxe_func_init_params *p) { struct tstorm_eth_function_common_config tcfg = { 0 }; if (CHIP_IS_E1x(sc)) { storm_memset_func_cfg(sc, &tcfg, p->func_id); } /* Enable the function in the FW */ storm_memset_vf_to_pf(sc, p->func_id, p->pf_id); storm_memset_func_en(sc, p->func_id, 1); /* spq */ if (p->func_flgs & FUNC_FLG_SPQ) { storm_memset_spq_addr(sc, p->spq_map, p->func_id); REG_WR(sc, (XSEM_REG_FAST_MEMORY + XSTORM_SPQ_PROD_OFFSET(p->func_id)), p->spq_prod); } } /* * Calculates the sum of vn_min_rates. * It's needed for further normalizing of the min_rates. * Returns: * sum of vn_min_rates. * or * 0 - if all the min_rates are 0. * In the later case fainess algorithm should be deactivated. * If all min rates are not zero then those that are zeroes will be set to 1. */ static void bxe_calc_vn_min(struct bxe_softc *sc, struct cmng_init_input *input) { uint32_t vn_cfg; uint32_t vn_min_rate; int all_zero = 1; int vn; for (vn = VN_0; vn < SC_MAX_VN_NUM(sc); vn++) { vn_cfg = sc->devinfo.mf_info.mf_config[vn]; vn_min_rate = (((vn_cfg & FUNC_MF_CFG_MIN_BW_MASK) >> FUNC_MF_CFG_MIN_BW_SHIFT) * 100); if (vn_cfg & FUNC_MF_CFG_FUNC_HIDE) { /* skip hidden VNs */ vn_min_rate = 0; } else if (!vn_min_rate) { /* If min rate is zero - set it to 100 */ vn_min_rate = DEF_MIN_RATE; } else { all_zero = 0; } input->vnic_min_rate[vn] = vn_min_rate; } /* if ETS or all min rates are zeros - disable fairness */ if (BXE_IS_ETS_ENABLED(sc)) { input->flags.cmng_enables &= ~CMNG_FLAGS_PER_PORT_FAIRNESS_VN; BLOGD(sc, DBG_LOAD, "Fairness disabled (ETS)\n"); } else if (all_zero) { input->flags.cmng_enables &= ~CMNG_FLAGS_PER_PORT_FAIRNESS_VN; BLOGD(sc, DBG_LOAD, "Fariness disabled (all MIN values are zeroes)\n"); } else { input->flags.cmng_enables |= CMNG_FLAGS_PER_PORT_FAIRNESS_VN; } } static inline uint16_t bxe_extract_max_cfg(struct bxe_softc *sc, uint32_t mf_cfg) { uint16_t max_cfg = ((mf_cfg & FUNC_MF_CFG_MAX_BW_MASK) >> FUNC_MF_CFG_MAX_BW_SHIFT); if (!max_cfg) { BLOGD(sc, DBG_LOAD, "Max BW configured to 0 - using 100 instead\n"); max_cfg = 100; } return (max_cfg); } static void bxe_calc_vn_max(struct bxe_softc *sc, int vn, struct cmng_init_input *input) { uint16_t vn_max_rate; uint32_t vn_cfg = sc->devinfo.mf_info.mf_config[vn]; uint32_t max_cfg; if (vn_cfg & FUNC_MF_CFG_FUNC_HIDE) { vn_max_rate = 0; } else { max_cfg = bxe_extract_max_cfg(sc, vn_cfg); if (IS_MF_SI(sc)) { /* max_cfg in percents of linkspeed */ vn_max_rate = ((sc->link_vars.line_speed * max_cfg) / 100); } else { /* SD modes */ /* max_cfg is absolute in 100Mb units */ vn_max_rate = (max_cfg * 100); } } BLOGD(sc, DBG_LOAD, "vn %d: vn_max_rate %d\n", vn, vn_max_rate); input->vnic_max_rate[vn] = vn_max_rate; } static void bxe_cmng_fns_init(struct bxe_softc *sc, uint8_t read_cfg, uint8_t cmng_type) { struct cmng_init_input input; int vn; memset(&input, 0, sizeof(struct cmng_init_input)); input.port_rate = sc->link_vars.line_speed; if (cmng_type == CMNG_FNS_MINMAX) { /* read mf conf from shmem */ if (read_cfg) { bxe_read_mf_cfg(sc); } /* get VN min rate and enable fairness if not 0 */ bxe_calc_vn_min(sc, &input); /* get VN max rate */ if (sc->port.pmf) { for (vn = VN_0; vn < SC_MAX_VN_NUM(sc); vn++) { bxe_calc_vn_max(sc, vn, &input); } } /* always enable rate shaping and fairness */ input.flags.cmng_enables |= CMNG_FLAGS_PER_PORT_RATE_SHAPING_VN; ecore_init_cmng(&input, &sc->cmng); return; } /* rate shaping and fairness are disabled */ BLOGD(sc, DBG_LOAD, "rate shaping and fairness have been disabled\n"); } static int bxe_get_cmng_fns_mode(struct bxe_softc *sc) { if (CHIP_REV_IS_SLOW(sc)) { return (CMNG_FNS_NONE); } if (IS_MF(sc)) { return (CMNG_FNS_MINMAX); } return (CMNG_FNS_NONE); } static void storm_memset_cmng(struct bxe_softc *sc, struct cmng_init *cmng, uint8_t port) { int vn; int func; uint32_t addr; size_t size; addr = (BAR_XSTRORM_INTMEM + XSTORM_CMNG_PER_PORT_VARS_OFFSET(port)); size = sizeof(struct cmng_struct_per_port); ecore_storm_memset_struct(sc, addr, size, (uint32_t *)&cmng->port); for (vn = VN_0; vn < SC_MAX_VN_NUM(sc); vn++) { func = func_by_vn(sc, vn); addr = (BAR_XSTRORM_INTMEM + XSTORM_RATE_SHAPING_PER_VN_VARS_OFFSET(func)); size = sizeof(struct rate_shaping_vars_per_vn); ecore_storm_memset_struct(sc, addr, size, (uint32_t *)&cmng->vnic.vnic_max_rate[vn]); addr = (BAR_XSTRORM_INTMEM + XSTORM_FAIRNESS_PER_VN_VARS_OFFSET(func)); size = sizeof(struct fairness_vars_per_vn); ecore_storm_memset_struct(sc, addr, size, (uint32_t *)&cmng->vnic.vnic_min_rate[vn]); } } static void bxe_pf_init(struct bxe_softc *sc) { struct bxe_func_init_params func_init = { 0 }; struct event_ring_data eq_data = { { 0 } }; uint16_t flags; if (!CHIP_IS_E1x(sc)) { /* reset IGU PF statistics: MSIX + ATTN */ /* PF */ REG_WR(sc, (IGU_REG_STATISTIC_NUM_MESSAGE_SENT + (BXE_IGU_STAS_MSG_VF_CNT * 4) + ((CHIP_IS_MODE_4_PORT(sc) ? SC_FUNC(sc) : SC_VN(sc)) * 4)), 0); /* ATTN */ REG_WR(sc, (IGU_REG_STATISTIC_NUM_MESSAGE_SENT + (BXE_IGU_STAS_MSG_VF_CNT * 4) + (BXE_IGU_STAS_MSG_PF_CNT * 4) + ((CHIP_IS_MODE_4_PORT(sc) ? SC_FUNC(sc) : SC_VN(sc)) * 4)), 0); } /* function setup flags */ flags = (FUNC_FLG_STATS | FUNC_FLG_LEADING | FUNC_FLG_SPQ); /* * This flag is relevant for E1x only. * E2 doesn't have a TPA configuration in a function level. */ flags |= (if_getcapenable(sc->ifp) & IFCAP_LRO) ? FUNC_FLG_TPA : 0; func_init.func_flgs = flags; func_init.pf_id = SC_FUNC(sc); func_init.func_id = SC_FUNC(sc); func_init.spq_map = sc->spq_dma.paddr; func_init.spq_prod = sc->spq_prod_idx; bxe_func_init(sc, &func_init); memset(&sc->cmng, 0, sizeof(struct cmng_struct_per_port)); /* * Congestion management values depend on the link rate. * There is no active link so initial link rate is set to 10Gbps. * When the link comes up the congestion management values are * re-calculated according to the actual link rate. */ sc->link_vars.line_speed = SPEED_10000; bxe_cmng_fns_init(sc, TRUE, bxe_get_cmng_fns_mode(sc)); /* Only the PMF sets the HW */ if (sc->port.pmf) { storm_memset_cmng(sc, &sc->cmng, SC_PORT(sc)); } /* init Event Queue - PCI bus guarantees correct endainity */ eq_data.base_addr.hi = U64_HI(sc->eq_dma.paddr); eq_data.base_addr.lo = U64_LO(sc->eq_dma.paddr); eq_data.producer = sc->eq_prod; eq_data.index_id = HC_SP_INDEX_EQ_CONS; eq_data.sb_id = DEF_SB_ID; storm_memset_eq_data(sc, &eq_data, SC_FUNC(sc)); } static void bxe_hc_int_enable(struct bxe_softc *sc) { int port = SC_PORT(sc); uint32_t addr = (port) ? HC_REG_CONFIG_1 : HC_REG_CONFIG_0; uint32_t val = REG_RD(sc, addr); uint8_t msix = (sc->interrupt_mode == INTR_MODE_MSIX) ? TRUE : FALSE; uint8_t single_msix = ((sc->interrupt_mode == INTR_MODE_MSIX) && (sc->intr_count == 1)) ? TRUE : FALSE; uint8_t msi = (sc->interrupt_mode == INTR_MODE_MSI) ? TRUE : FALSE; if (msix) { val &= ~(HC_CONFIG_0_REG_SINGLE_ISR_EN_0 | HC_CONFIG_0_REG_INT_LINE_EN_0); val |= (HC_CONFIG_0_REG_MSI_MSIX_INT_EN_0 | HC_CONFIG_0_REG_ATTN_BIT_EN_0); if (single_msix) { val |= HC_CONFIG_0_REG_SINGLE_ISR_EN_0; } } else if (msi) { val &= ~HC_CONFIG_0_REG_INT_LINE_EN_0; val |= (HC_CONFIG_0_REG_SINGLE_ISR_EN_0 | HC_CONFIG_0_REG_MSI_MSIX_INT_EN_0 | HC_CONFIG_0_REG_ATTN_BIT_EN_0); } else { val |= (HC_CONFIG_0_REG_SINGLE_ISR_EN_0 | HC_CONFIG_0_REG_MSI_MSIX_INT_EN_0 | HC_CONFIG_0_REG_INT_LINE_EN_0 | HC_CONFIG_0_REG_ATTN_BIT_EN_0); if (!CHIP_IS_E1(sc)) { BLOGD(sc, DBG_INTR, "write %x to HC %d (addr 0x%x)\n", val, port, addr); REG_WR(sc, addr, val); val &= ~HC_CONFIG_0_REG_MSI_MSIX_INT_EN_0; } } if (CHIP_IS_E1(sc)) { REG_WR(sc, (HC_REG_INT_MASK + port*4), 0x1FFFF); } BLOGD(sc, DBG_INTR, "write %x to HC %d (addr 0x%x) mode %s\n", val, port, addr, ((msix) ? "MSI-X" : ((msi) ? "MSI" : "INTx"))); REG_WR(sc, addr, val); /* ensure that HC_CONFIG is written before leading/trailing edge config */ mb(); if (!CHIP_IS_E1(sc)) { /* init leading/trailing edge */ if (IS_MF(sc)) { val = (0xee0f | (1 << (SC_VN(sc) + 4))); if (sc->port.pmf) { /* enable nig and gpio3 attention */ val |= 0x1100; } } else { val = 0xffff; } REG_WR(sc, (HC_REG_TRAILING_EDGE_0 + port*8), val); REG_WR(sc, (HC_REG_LEADING_EDGE_0 + port*8), val); } /* make sure that interrupts are indeed enabled from here on */ mb(); } static void bxe_igu_int_enable(struct bxe_softc *sc) { uint32_t val; uint8_t msix = (sc->interrupt_mode == INTR_MODE_MSIX) ? TRUE : FALSE; uint8_t single_msix = ((sc->interrupt_mode == INTR_MODE_MSIX) && (sc->intr_count == 1)) ? TRUE : FALSE; uint8_t msi = (sc->interrupt_mode == INTR_MODE_MSI) ? TRUE : FALSE; val = REG_RD(sc, IGU_REG_PF_CONFIGURATION); if (msix) { val &= ~(IGU_PF_CONF_INT_LINE_EN | IGU_PF_CONF_SINGLE_ISR_EN); val |= (IGU_PF_CONF_MSI_MSIX_EN | IGU_PF_CONF_ATTN_BIT_EN); if (single_msix) { val |= IGU_PF_CONF_SINGLE_ISR_EN; } } else if (msi) { val &= ~IGU_PF_CONF_INT_LINE_EN; val |= (IGU_PF_CONF_MSI_MSIX_EN | IGU_PF_CONF_ATTN_BIT_EN | IGU_PF_CONF_SINGLE_ISR_EN); } else { val &= ~IGU_PF_CONF_MSI_MSIX_EN; val |= (IGU_PF_CONF_INT_LINE_EN | IGU_PF_CONF_ATTN_BIT_EN | IGU_PF_CONF_SINGLE_ISR_EN); } /* clean previous status - need to configure igu prior to ack*/ if ((!msix) || single_msix) { REG_WR(sc, IGU_REG_PF_CONFIGURATION, val); bxe_ack_int(sc); } val |= IGU_PF_CONF_FUNC_EN; BLOGD(sc, DBG_INTR, "write 0x%x to IGU mode %s\n", val, ((msix) ? "MSI-X" : ((msi) ? "MSI" : "INTx"))); REG_WR(sc, IGU_REG_PF_CONFIGURATION, val); mb(); /* init leading/trailing edge */ if (IS_MF(sc)) { val = (0xee0f | (1 << (SC_VN(sc) + 4))); if (sc->port.pmf) { /* enable nig and gpio3 attention */ val |= 0x1100; } } else { val = 0xffff; } REG_WR(sc, IGU_REG_TRAILING_EDGE_LATCH, val); REG_WR(sc, IGU_REG_LEADING_EDGE_LATCH, val); /* make sure that interrupts are indeed enabled from here on */ mb(); } static void bxe_int_enable(struct bxe_softc *sc) { if (sc->devinfo.int_block == INT_BLOCK_HC) { bxe_hc_int_enable(sc); } else { bxe_igu_int_enable(sc); } } static void bxe_hc_int_disable(struct bxe_softc *sc) { int port = SC_PORT(sc); uint32_t addr = (port) ? HC_REG_CONFIG_1 : HC_REG_CONFIG_0; uint32_t val = REG_RD(sc, addr); /* * In E1 we must use only PCI configuration space to disable MSI/MSIX * capablility. It's forbidden to disable IGU_PF_CONF_MSI_MSIX_EN in HC * block */ if (CHIP_IS_E1(sc)) { /* * Since IGU_PF_CONF_MSI_MSIX_EN still always on use mask register * to prevent from HC sending interrupts after we exit the function */ REG_WR(sc, (HC_REG_INT_MASK + port*4), 0); val &= ~(HC_CONFIG_0_REG_SINGLE_ISR_EN_0 | HC_CONFIG_0_REG_INT_LINE_EN_0 | HC_CONFIG_0_REG_ATTN_BIT_EN_0); } else { val &= ~(HC_CONFIG_0_REG_SINGLE_ISR_EN_0 | HC_CONFIG_0_REG_MSI_MSIX_INT_EN_0 | HC_CONFIG_0_REG_INT_LINE_EN_0 | HC_CONFIG_0_REG_ATTN_BIT_EN_0); } BLOGD(sc, DBG_INTR, "write %x to HC %d (addr 0x%x)\n", val, port, addr); /* flush all outstanding writes */ mb(); REG_WR(sc, addr, val); if (REG_RD(sc, addr) != val) { BLOGE(sc, "proper val not read from HC IGU!\n"); } } static void bxe_igu_int_disable(struct bxe_softc *sc) { uint32_t val = REG_RD(sc, IGU_REG_PF_CONFIGURATION); val &= ~(IGU_PF_CONF_MSI_MSIX_EN | IGU_PF_CONF_INT_LINE_EN | IGU_PF_CONF_ATTN_BIT_EN); BLOGD(sc, DBG_INTR, "write %x to IGU\n", val); /* flush all outstanding writes */ mb(); REG_WR(sc, IGU_REG_PF_CONFIGURATION, val); if (REG_RD(sc, IGU_REG_PF_CONFIGURATION) != val) { BLOGE(sc, "proper val not read from IGU!\n"); } } static void bxe_int_disable(struct bxe_softc *sc) { if (sc->devinfo.int_block == INT_BLOCK_HC) { bxe_hc_int_disable(sc); } else { bxe_igu_int_disable(sc); } } static void bxe_nic_init(struct bxe_softc *sc, int load_code) { int i; for (i = 0; i < sc->num_queues; i++) { bxe_init_eth_fp(sc, i); } rmb(); /* ensure status block indices were read */ bxe_init_rx_rings(sc); bxe_init_tx_rings(sc); if (IS_VF(sc)) { return; } /* initialize MOD_ABS interrupts */ elink_init_mod_abs_int(sc, &sc->link_vars, sc->devinfo.chip_id, sc->devinfo.shmem_base, sc->devinfo.shmem2_base, SC_PORT(sc)); bxe_init_def_sb(sc); bxe_update_dsb_idx(sc); bxe_init_sp_ring(sc); bxe_init_eq_ring(sc); bxe_init_internal(sc, load_code); bxe_pf_init(sc); bxe_stats_init(sc); /* flush all before enabling interrupts */ mb(); bxe_int_enable(sc); /* check for SPIO5 */ bxe_attn_int_deasserted0(sc, REG_RD(sc, (MISC_REG_AEU_AFTER_INVERT_1_FUNC_0 + SC_PORT(sc)*4)) & AEU_INPUTS_ATTN_BITS_SPIO5); } static inline void bxe_init_objs(struct bxe_softc *sc) { /* mcast rules must be added to tx if tx switching is enabled */ ecore_obj_type o_type = (sc->flags & BXE_TX_SWITCHING) ? ECORE_OBJ_TYPE_RX_TX : ECORE_OBJ_TYPE_RX; /* RX_MODE controlling object */ ecore_init_rx_mode_obj(sc, &sc->rx_mode_obj); /* multicast configuration controlling object */ ecore_init_mcast_obj(sc, &sc->mcast_obj, sc->fp[0].cl_id, sc->fp[0].index, SC_FUNC(sc), SC_FUNC(sc), BXE_SP(sc, mcast_rdata), BXE_SP_MAPPING(sc, mcast_rdata), ECORE_FILTER_MCAST_PENDING, &sc->sp_state, o_type); /* Setup CAM credit pools */ ecore_init_mac_credit_pool(sc, &sc->macs_pool, SC_FUNC(sc), CHIP_IS_E1x(sc) ? VNICS_PER_PORT(sc) : VNICS_PER_PATH(sc)); ecore_init_vlan_credit_pool(sc, &sc->vlans_pool, SC_ABS_FUNC(sc) >> 1, CHIP_IS_E1x(sc) ? VNICS_PER_PORT(sc) : VNICS_PER_PATH(sc)); /* RSS configuration object */ ecore_init_rss_config_obj(sc, &sc->rss_conf_obj, sc->fp[0].cl_id, sc->fp[0].index, SC_FUNC(sc), SC_FUNC(sc), BXE_SP(sc, rss_rdata), BXE_SP_MAPPING(sc, rss_rdata), ECORE_FILTER_RSS_CONF_PENDING, &sc->sp_state, ECORE_OBJ_TYPE_RX); } /* * Initialize the function. This must be called before sending CLIENT_SETUP * for the first client. */ static inline int bxe_func_start(struct bxe_softc *sc) { struct ecore_func_state_params func_params = { NULL }; struct ecore_func_start_params *start_params = &func_params.params.start; /* Prepare parameters for function state transitions */ bit_set(&func_params.ramrod_flags, RAMROD_COMP_WAIT); func_params.f_obj = &sc->func_obj; func_params.cmd = ECORE_F_CMD_START; /* Function parameters */ start_params->mf_mode = sc->devinfo.mf_info.mf_mode; start_params->sd_vlan_tag = OVLAN(sc); if (CHIP_IS_E2(sc) || CHIP_IS_E3(sc)) { start_params->network_cos_mode = STATIC_COS; } else { /* CHIP_IS_E1X */ start_params->network_cos_mode = FW_WRR; } start_params->gre_tunnel_mode = 0; start_params->gre_tunnel_rss = 0; return (ecore_func_state_change(sc, &func_params)); } static int bxe_set_power_state(struct bxe_softc *sc, uint8_t state) { uint16_t pmcsr; /* If there is no power capability, silently succeed */ if (!(sc->devinfo.pcie_cap_flags & BXE_PM_CAPABLE_FLAG)) { BLOGW(sc, "No power capability\n"); return (0); } pmcsr = pci_read_config(sc->dev, (sc->devinfo.pcie_pm_cap_reg + PCIR_POWER_STATUS), 2); switch (state) { case PCI_PM_D0: pci_write_config(sc->dev, (sc->devinfo.pcie_pm_cap_reg + PCIR_POWER_STATUS), ((pmcsr & ~PCIM_PSTAT_DMASK) | PCIM_PSTAT_PME), 2); if (pmcsr & PCIM_PSTAT_DMASK) { /* delay required during transition out of D3hot */ DELAY(20000); } break; case PCI_PM_D3hot: /* XXX if there are other clients above don't shut down the power */ /* don't shut down the power for emulation and FPGA */ if (CHIP_REV_IS_SLOW(sc)) { return (0); } pmcsr &= ~PCIM_PSTAT_DMASK; pmcsr |= PCIM_PSTAT_D3; if (sc->wol) { pmcsr |= PCIM_PSTAT_PMEENABLE; } pci_write_config(sc->dev, (sc->devinfo.pcie_pm_cap_reg + PCIR_POWER_STATUS), pmcsr, 4); /* * No more memory access after this point until device is brought back * to D0 state. */ break; default: BLOGE(sc, "Can't support PCI power state = %d\n", state); return (-1); } return (0); } /* return true if succeeded to acquire the lock */ static uint8_t bxe_trylock_hw_lock(struct bxe_softc *sc, uint32_t resource) { uint32_t lock_status; uint32_t resource_bit = (1 << resource); int func = SC_FUNC(sc); uint32_t hw_lock_control_reg; BLOGD(sc, DBG_LOAD, "Trying to take a resource lock 0x%x\n", resource); /* Validating that the resource is within range */ if (resource > HW_LOCK_MAX_RESOURCE_VALUE) { BLOGD(sc, DBG_LOAD, "resource(0x%x) > HW_LOCK_MAX_RESOURCE_VALUE(0x%x)\n", resource, HW_LOCK_MAX_RESOURCE_VALUE); return (FALSE); } if (func <= 5) { hw_lock_control_reg = (MISC_REG_DRIVER_CONTROL_1 + func*8); } else { hw_lock_control_reg = (MISC_REG_DRIVER_CONTROL_7 + (func - 6)*8); } /* try to acquire the lock */ REG_WR(sc, hw_lock_control_reg + 4, resource_bit); lock_status = REG_RD(sc, hw_lock_control_reg); if (lock_status & resource_bit) { return (TRUE); } BLOGE(sc, "Failed to get a resource lock 0x%x\n", resource); return (FALSE); } /* * Get the recovery leader resource id according to the engine this function * belongs to. Currently only only 2 engines is supported. */ static int bxe_get_leader_lock_resource(struct bxe_softc *sc) { if (SC_PATH(sc)) { return (HW_LOCK_RESOURCE_RECOVERY_LEADER_1); } else { return (HW_LOCK_RESOURCE_RECOVERY_LEADER_0); } } /* try to acquire a leader lock for current engine */ static uint8_t bxe_trylock_leader_lock(struct bxe_softc *sc) { return (bxe_trylock_hw_lock(sc, bxe_get_leader_lock_resource(sc))); } static int bxe_release_leader_lock(struct bxe_softc *sc) { return (bxe_release_hw_lock(sc, bxe_get_leader_lock_resource(sc))); } /* close gates #2, #3 and #4 */ static void bxe_set_234_gates(struct bxe_softc *sc, uint8_t close) { uint32_t val; /* gates #2 and #4a are closed/opened for "not E1" only */ if (!CHIP_IS_E1(sc)) { /* #4 */ REG_WR(sc, PXP_REG_HST_DISCARD_DOORBELLS, !!close); /* #2 */ REG_WR(sc, PXP_REG_HST_DISCARD_INTERNAL_WRITES, !!close); } /* #3 */ if (CHIP_IS_E1x(sc)) { /* prevent interrupts from HC on both ports */ val = REG_RD(sc, HC_REG_CONFIG_1); REG_WR(sc, HC_REG_CONFIG_1, (!close) ? (val | HC_CONFIG_1_REG_BLOCK_DISABLE_1) : (val & ~(uint32_t)HC_CONFIG_1_REG_BLOCK_DISABLE_1)); val = REG_RD(sc, HC_REG_CONFIG_0); REG_WR(sc, HC_REG_CONFIG_0, (!close) ? (val | HC_CONFIG_0_REG_BLOCK_DISABLE_0) : (val & ~(uint32_t)HC_CONFIG_0_REG_BLOCK_DISABLE_0)); } else { /* Prevent incomming interrupts in IGU */ val = REG_RD(sc, IGU_REG_BLOCK_CONFIGURATION); REG_WR(sc, IGU_REG_BLOCK_CONFIGURATION, (!close) ? (val | IGU_BLOCK_CONFIGURATION_REG_BLOCK_ENABLE) : (val & ~(uint32_t)IGU_BLOCK_CONFIGURATION_REG_BLOCK_ENABLE)); } BLOGD(sc, DBG_LOAD, "%s gates #2, #3 and #4\n", close ? "closing" : "opening"); wmb(); } /* poll for pending writes bit, it should get cleared in no more than 1s */ static int bxe_er_poll_igu_vq(struct bxe_softc *sc) { uint32_t cnt = 1000; uint32_t pend_bits = 0; do { pend_bits = REG_RD(sc, IGU_REG_PENDING_BITS_STATUS); if (pend_bits == 0) { break; } DELAY(1000); } while (--cnt > 0); if (cnt == 0) { BLOGE(sc, "Still pending IGU requests bits=0x%08x!\n", pend_bits); return (-1); } return (0); } #define SHARED_MF_CLP_MAGIC 0x80000000 /* 'magic' bit */ static void bxe_clp_reset_prep(struct bxe_softc *sc, uint32_t *magic_val) { /* Do some magic... */ uint32_t val = MFCFG_RD(sc, shared_mf_config.clp_mb); *magic_val = val & SHARED_MF_CLP_MAGIC; MFCFG_WR(sc, shared_mf_config.clp_mb, val | SHARED_MF_CLP_MAGIC); } /* restore the value of the 'magic' bit */ static void bxe_clp_reset_done(struct bxe_softc *sc, uint32_t magic_val) { /* Restore the 'magic' bit value... */ uint32_t val = MFCFG_RD(sc, shared_mf_config.clp_mb); MFCFG_WR(sc, shared_mf_config.clp_mb, (val & (~SHARED_MF_CLP_MAGIC)) | magic_val); } /* prepare for MCP reset, takes care of CLP configurations */ static void bxe_reset_mcp_prep(struct bxe_softc *sc, uint32_t *magic_val) { uint32_t shmem; uint32_t validity_offset; /* set `magic' bit in order to save MF config */ if (!CHIP_IS_E1(sc)) { bxe_clp_reset_prep(sc, magic_val); } /* get shmem offset */ shmem = REG_RD(sc, MISC_REG_SHARED_MEM_ADDR); validity_offset = offsetof(struct shmem_region, validity_map[SC_PORT(sc)]); /* Clear validity map flags */ if (shmem > 0) { REG_WR(sc, shmem + validity_offset, 0); } } #define MCP_TIMEOUT 5000 /* 5 seconds (in ms) */ #define MCP_ONE_TIMEOUT 100 /* 100 ms */ static void bxe_mcp_wait_one(struct bxe_softc *sc) { /* special handling for emulation and FPGA (10 times longer) */ if (CHIP_REV_IS_SLOW(sc)) { DELAY((MCP_ONE_TIMEOUT*10) * 1000); } else { DELAY((MCP_ONE_TIMEOUT) * 1000); } } /* initialize shmem_base and waits for validity signature to appear */ static int bxe_init_shmem(struct bxe_softc *sc) { int cnt = 0; uint32_t val = 0; do { sc->devinfo.shmem_base = sc->link_params.shmem_base = REG_RD(sc, MISC_REG_SHARED_MEM_ADDR); if (sc->devinfo.shmem_base) { val = SHMEM_RD(sc, validity_map[SC_PORT(sc)]); if (val & SHR_MEM_VALIDITY_MB) return (0); } bxe_mcp_wait_one(sc); } while (cnt++ < (MCP_TIMEOUT / MCP_ONE_TIMEOUT)); BLOGE(sc, "BAD MCP validity signature\n"); return (-1); } static int bxe_reset_mcp_comp(struct bxe_softc *sc, uint32_t magic_val) { int rc = bxe_init_shmem(sc); /* Restore the `magic' bit value */ if (!CHIP_IS_E1(sc)) { bxe_clp_reset_done(sc, magic_val); } return (rc); } static void bxe_pxp_prep(struct bxe_softc *sc) { if (!CHIP_IS_E1(sc)) { REG_WR(sc, PXP2_REG_RD_START_INIT, 0); REG_WR(sc, PXP2_REG_RQ_RBC_DONE, 0); wmb(); } } /* * Reset the whole chip except for: * - PCIE core * - PCI Glue, PSWHST, PXP/PXP2 RF (all controlled by one reset bit) * - IGU * - MISC (including AEU) * - GRC * - RBCN, RBCP */ static void bxe_process_kill_chip_reset(struct bxe_softc *sc, uint8_t global) { uint32_t not_reset_mask1, reset_mask1, not_reset_mask2, reset_mask2; uint32_t global_bits2, stay_reset2; /* * Bits that have to be set in reset_mask2 if we want to reset 'global' * (per chip) blocks. */ global_bits2 = MISC_REGISTERS_RESET_REG_2_RST_MCP_N_RESET_CMN_CPU | MISC_REGISTERS_RESET_REG_2_RST_MCP_N_RESET_CMN_CORE; /* * Don't reset the following blocks. * Important: per port blocks (such as EMAC, BMAC, UMAC) can't be * reset, as in 4 port device they might still be owned * by the MCP (there is only one leader per path). */ not_reset_mask1 = MISC_REGISTERS_RESET_REG_1_RST_HC | MISC_REGISTERS_RESET_REG_1_RST_PXPV | MISC_REGISTERS_RESET_REG_1_RST_PXP; not_reset_mask2 = MISC_REGISTERS_RESET_REG_2_RST_PCI_MDIO | MISC_REGISTERS_RESET_REG_2_RST_EMAC0_HARD_CORE | MISC_REGISTERS_RESET_REG_2_RST_EMAC1_HARD_CORE | MISC_REGISTERS_RESET_REG_2_RST_MISC_CORE | MISC_REGISTERS_RESET_REG_2_RST_RBCN | MISC_REGISTERS_RESET_REG_2_RST_GRC | MISC_REGISTERS_RESET_REG_2_RST_MCP_N_RESET_REG_HARD_CORE | MISC_REGISTERS_RESET_REG_2_RST_MCP_N_HARD_CORE_RST_B | MISC_REGISTERS_RESET_REG_2_RST_ATC | MISC_REGISTERS_RESET_REG_2_PGLC | MISC_REGISTERS_RESET_REG_2_RST_BMAC0 | MISC_REGISTERS_RESET_REG_2_RST_BMAC1 | MISC_REGISTERS_RESET_REG_2_RST_EMAC0 | MISC_REGISTERS_RESET_REG_2_RST_EMAC1 | MISC_REGISTERS_RESET_REG_2_UMAC0 | MISC_REGISTERS_RESET_REG_2_UMAC1; /* * Keep the following blocks in reset: * - all xxMACs are handled by the elink code. */ stay_reset2 = MISC_REGISTERS_RESET_REG_2_XMAC | MISC_REGISTERS_RESET_REG_2_XMAC_SOFT; /* Full reset masks according to the chip */ reset_mask1 = 0xffffffff; if (CHIP_IS_E1(sc)) reset_mask2 = 0xffff; else if (CHIP_IS_E1H(sc)) reset_mask2 = 0x1ffff; else if (CHIP_IS_E2(sc)) reset_mask2 = 0xfffff; else /* CHIP_IS_E3 */ reset_mask2 = 0x3ffffff; /* Don't reset global blocks unless we need to */ if (!global) reset_mask2 &= ~global_bits2; /* * In case of attention in the QM, we need to reset PXP * (MISC_REGISTERS_RESET_REG_2_RST_PXP_RQ_RD_WR) before QM * because otherwise QM reset would release 'close the gates' shortly * before resetting the PXP, then the PSWRQ would send a write * request to PGLUE. Then when PXP is reset, PGLUE would try to * read the payload data from PSWWR, but PSWWR would not * respond. The write queue in PGLUE would stuck, dmae commands * would not return. Therefore it's important to reset the second * reset register (containing the * MISC_REGISTERS_RESET_REG_2_RST_PXP_RQ_RD_WR bit) before the * first one (containing the MISC_REGISTERS_RESET_REG_1_RST_QM * bit). */ REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_2_CLEAR, reset_mask2 & (~not_reset_mask2)); REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_CLEAR, reset_mask1 & (~not_reset_mask1)); mb(); wmb(); REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_2_SET, reset_mask2 & (~stay_reset2)); mb(); wmb(); REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_SET, reset_mask1); wmb(); } static int bxe_process_kill(struct bxe_softc *sc, uint8_t global) { int cnt = 1000; uint32_t val = 0; uint32_t sr_cnt, blk_cnt, port_is_idle_0, port_is_idle_1, pgl_exp_rom2; uint32_t tags_63_32 = 0; /* Empty the Tetris buffer, wait for 1s */ do { sr_cnt = REG_RD(sc, PXP2_REG_RD_SR_CNT); blk_cnt = REG_RD(sc, PXP2_REG_RD_BLK_CNT); port_is_idle_0 = REG_RD(sc, PXP2_REG_RD_PORT_IS_IDLE_0); port_is_idle_1 = REG_RD(sc, PXP2_REG_RD_PORT_IS_IDLE_1); pgl_exp_rom2 = REG_RD(sc, PXP2_REG_PGL_EXP_ROM2); if (CHIP_IS_E3(sc)) { tags_63_32 = REG_RD(sc, PGLUE_B_REG_TAGS_63_32); } if ((sr_cnt == 0x7e) && (blk_cnt == 0xa0) && ((port_is_idle_0 & 0x1) == 0x1) && ((port_is_idle_1 & 0x1) == 0x1) && (pgl_exp_rom2 == 0xffffffff) && (!CHIP_IS_E3(sc) || (tags_63_32 == 0xffffffff))) break; DELAY(1000); } while (cnt-- > 0); if (cnt <= 0) { BLOGE(sc, "ERROR: Tetris buffer didn't get empty or there " "are still outstanding read requests after 1s! " "sr_cnt=0x%08x, blk_cnt=0x%08x, port_is_idle_0=0x%08x, " "port_is_idle_1=0x%08x, pgl_exp_rom2=0x%08x\n", sr_cnt, blk_cnt, port_is_idle_0, port_is_idle_1, pgl_exp_rom2); return (-1); } mb(); /* Close gates #2, #3 and #4 */ bxe_set_234_gates(sc, TRUE); /* Poll for IGU VQs for 57712 and newer chips */ if (!CHIP_IS_E1x(sc) && bxe_er_poll_igu_vq(sc)) { return (-1); } /* XXX indicate that "process kill" is in progress to MCP */ /* clear "unprepared" bit */ REG_WR(sc, MISC_REG_UNPREPARED, 0); mb(); /* Make sure all is written to the chip before the reset */ wmb(); /* * Wait for 1ms to empty GLUE and PCI-E core queues, * PSWHST, GRC and PSWRD Tetris buffer. */ DELAY(1000); /* Prepare to chip reset: */ /* MCP */ if (global) { bxe_reset_mcp_prep(sc, &val); } /* PXP */ bxe_pxp_prep(sc); mb(); /* reset the chip */ bxe_process_kill_chip_reset(sc, global); mb(); /* clear errors in PGB */ if (!CHIP_IS_E1(sc)) REG_WR(sc, PGLUE_B_REG_LATCHED_ERRORS_CLR, 0x7f); /* Recover after reset: */ /* MCP */ if (global && bxe_reset_mcp_comp(sc, val)) { return (-1); } /* XXX add resetting the NO_MCP mode DB here */ /* Open the gates #2, #3 and #4 */ bxe_set_234_gates(sc, FALSE); /* XXX * IGU/AEU preparation bring back the AEU/IGU to a reset state * re-enable attentions */ return (0); } static int bxe_leader_reset(struct bxe_softc *sc) { int rc = 0; uint8_t global = bxe_reset_is_global(sc); uint32_t load_code; /* * If not going to reset MCP, load "fake" driver to reset HW while * driver is owner of the HW. */ if (!global && !BXE_NOMCP(sc)) { load_code = bxe_fw_command(sc, DRV_MSG_CODE_LOAD_REQ, DRV_MSG_CODE_LOAD_REQ_WITH_LFA); if (!load_code) { BLOGE(sc, "MCP response failure, aborting\n"); rc = -1; goto exit_leader_reset; } if ((load_code != FW_MSG_CODE_DRV_LOAD_COMMON_CHIP) && (load_code != FW_MSG_CODE_DRV_LOAD_COMMON)) { BLOGE(sc, "MCP unexpected response, aborting\n"); rc = -1; goto exit_leader_reset2; } load_code = bxe_fw_command(sc, DRV_MSG_CODE_LOAD_DONE, 0); if (!load_code) { BLOGE(sc, "MCP response failure, aborting\n"); rc = -1; goto exit_leader_reset2; } } /* try to recover after the failure */ if (bxe_process_kill(sc, global)) { BLOGE(sc, "Something bad occurred on engine %d!\n", SC_PATH(sc)); rc = -1; goto exit_leader_reset2; } /* * Clear the RESET_IN_PROGRESS and RESET_GLOBAL bits and update the driver * state. */ bxe_set_reset_done(sc); if (global) { bxe_clear_reset_global(sc); } exit_leader_reset2: /* unload "fake driver" if it was loaded */ if (!global && !BXE_NOMCP(sc)) { bxe_fw_command(sc, DRV_MSG_CODE_UNLOAD_REQ_WOL_MCP, 0); bxe_fw_command(sc, DRV_MSG_CODE_UNLOAD_DONE, 0); } exit_leader_reset: sc->is_leader = 0; bxe_release_leader_lock(sc); mb(); return (rc); } /* * prepare INIT transition, parameters configured: * - HC configuration * - Queue's CDU context */ static void bxe_pf_q_prep_init(struct bxe_softc *sc, struct bxe_fastpath *fp, struct ecore_queue_init_params *init_params) { uint8_t cos; int cxt_index, cxt_offset; bxe_set_bit(ECORE_Q_FLG_HC, &init_params->rx.flags); bxe_set_bit(ECORE_Q_FLG_HC, &init_params->tx.flags); bxe_set_bit(ECORE_Q_FLG_HC_EN, &init_params->rx.flags); bxe_set_bit(ECORE_Q_FLG_HC_EN, &init_params->tx.flags); /* HC rate */ init_params->rx.hc_rate = sc->hc_rx_ticks ? (1000000 / sc->hc_rx_ticks) : 0; init_params->tx.hc_rate = sc->hc_tx_ticks ? (1000000 / sc->hc_tx_ticks) : 0; /* FW SB ID */ init_params->rx.fw_sb_id = init_params->tx.fw_sb_id = fp->fw_sb_id; /* CQ index among the SB indices */ init_params->rx.sb_cq_index = HC_INDEX_ETH_RX_CQ_CONS; init_params->tx.sb_cq_index = HC_INDEX_ETH_FIRST_TX_CQ_CONS; /* set maximum number of COSs supported by this queue */ init_params->max_cos = sc->max_cos; BLOGD(sc, DBG_LOAD, "fp %d setting queue params max cos to %d\n", fp->index, init_params->max_cos); /* set the context pointers queue object */ for (cos = FIRST_TX_COS_INDEX; cos < init_params->max_cos; cos++) { /* XXX change index/cid here if ever support multiple tx CoS */ /* fp->txdata[cos]->cid */ cxt_index = fp->index / ILT_PAGE_CIDS; cxt_offset = fp->index - (cxt_index * ILT_PAGE_CIDS); init_params->cxts[cos] = &sc->context[cxt_index].vcxt[cxt_offset].eth; } } /* set flags that are common for the Tx-only and not normal connections */ static unsigned long bxe_get_common_flags(struct bxe_softc *sc, struct bxe_fastpath *fp, uint8_t zero_stats) { unsigned long flags = 0; /* PF driver will always initialize the Queue to an ACTIVE state */ bxe_set_bit(ECORE_Q_FLG_ACTIVE, &flags); /* * tx only connections collect statistics (on the same index as the * parent connection). The statistics are zeroed when the parent * connection is initialized. */ bxe_set_bit(ECORE_Q_FLG_STATS, &flags); if (zero_stats) { bxe_set_bit(ECORE_Q_FLG_ZERO_STATS, &flags); } /* * tx only connections can support tx-switching, though their * CoS-ness doesn't survive the loopback */ if (sc->flags & BXE_TX_SWITCHING) { bxe_set_bit(ECORE_Q_FLG_TX_SWITCH, &flags); } bxe_set_bit(ECORE_Q_FLG_PCSUM_ON_PKT, &flags); return (flags); } static unsigned long bxe_get_q_flags(struct bxe_softc *sc, struct bxe_fastpath *fp, uint8_t leading) { unsigned long flags = 0; if (IS_MF_SD(sc)) { bxe_set_bit(ECORE_Q_FLG_OV, &flags); } if (if_getcapenable(sc->ifp) & IFCAP_LRO) { bxe_set_bit(ECORE_Q_FLG_TPA, &flags); bxe_set_bit(ECORE_Q_FLG_TPA_IPV6, &flags); #if 0 if (fp->mode == TPA_MODE_GRO) __set_bit(ECORE_Q_FLG_TPA_GRO, &flags); #endif } if (leading) { bxe_set_bit(ECORE_Q_FLG_LEADING_RSS, &flags); bxe_set_bit(ECORE_Q_FLG_MCAST, &flags); } bxe_set_bit(ECORE_Q_FLG_VLAN, &flags); #if 0 /* configure silent vlan removal */ if (IS_MF_AFEX(sc)) { bxe_set_bit(ECORE_Q_FLG_SILENT_VLAN_REM, &flags); } #endif /* merge with common flags */ return (flags | bxe_get_common_flags(sc, fp, TRUE)); } static void bxe_pf_q_prep_general(struct bxe_softc *sc, struct bxe_fastpath *fp, struct ecore_general_setup_params *gen_init, uint8_t cos) { gen_init->stat_id = bxe_stats_id(fp); gen_init->spcl_id = fp->cl_id; gen_init->mtu = sc->mtu; gen_init->cos = cos; } static void bxe_pf_rx_q_prep(struct bxe_softc *sc, struct bxe_fastpath *fp, struct rxq_pause_params *pause, struct ecore_rxq_setup_params *rxq_init) { uint8_t max_sge = 0; uint16_t sge_sz = 0; uint16_t tpa_agg_size = 0; if (if_getcapenable(sc->ifp) & IFCAP_LRO) { pause->sge_th_lo = SGE_TH_LO(sc); pause->sge_th_hi = SGE_TH_HI(sc); /* validate SGE ring has enough to cross high threshold */ if (sc->dropless_fc && (pause->sge_th_hi + FW_PREFETCH_CNT) > (RX_SGE_USABLE_PER_PAGE * RX_SGE_NUM_PAGES)) { BLOGW(sc, "sge ring threshold limit\n"); } /* minimum max_aggregation_size is 2*MTU (two full buffers) */ tpa_agg_size = (2 * sc->mtu); if (tpa_agg_size < sc->max_aggregation_size) { tpa_agg_size = sc->max_aggregation_size; } max_sge = SGE_PAGE_ALIGN(sc->mtu) >> SGE_PAGE_SHIFT; max_sge = ((max_sge + PAGES_PER_SGE - 1) & (~(PAGES_PER_SGE - 1))) >> PAGES_PER_SGE_SHIFT; sge_sz = (uint16_t)min(SGE_PAGES, 0xffff); } /* pause - not for e1 */ if (!CHIP_IS_E1(sc)) { pause->bd_th_lo = BD_TH_LO(sc); pause->bd_th_hi = BD_TH_HI(sc); pause->rcq_th_lo = RCQ_TH_LO(sc); pause->rcq_th_hi = RCQ_TH_HI(sc); /* validate rings have enough entries to cross high thresholds */ if (sc->dropless_fc && pause->bd_th_hi + FW_PREFETCH_CNT > sc->rx_ring_size) { BLOGW(sc, "rx bd ring threshold limit\n"); } if (sc->dropless_fc && pause->rcq_th_hi + FW_PREFETCH_CNT > RCQ_NUM_PAGES * RCQ_USABLE_PER_PAGE) { BLOGW(sc, "rcq ring threshold limit\n"); } pause->pri_map = 1; } /* rxq setup */ rxq_init->dscr_map = fp->rx_dma.paddr; rxq_init->sge_map = fp->rx_sge_dma.paddr; rxq_init->rcq_map = fp->rcq_dma.paddr; rxq_init->rcq_np_map = (fp->rcq_dma.paddr + BCM_PAGE_SIZE); /* * This should be a maximum number of data bytes that may be * placed on the BD (not including paddings). */ rxq_init->buf_sz = (fp->rx_buf_size - IP_HEADER_ALIGNMENT_PADDING); rxq_init->cl_qzone_id = fp->cl_qzone_id; rxq_init->tpa_agg_sz = tpa_agg_size; rxq_init->sge_buf_sz = sge_sz; rxq_init->max_sges_pkt = max_sge; rxq_init->rss_engine_id = SC_FUNC(sc); rxq_init->mcast_engine_id = SC_FUNC(sc); /* * Maximum number or simultaneous TPA aggregation for this Queue. * For PF Clients it should be the maximum available number. * VF driver(s) may want to define it to a smaller value. */ rxq_init->max_tpa_queues = MAX_AGG_QS(sc); rxq_init->cache_line_log = BXE_RX_ALIGN_SHIFT; rxq_init->fw_sb_id = fp->fw_sb_id; rxq_init->sb_cq_index = HC_INDEX_ETH_RX_CQ_CONS; /* * configure silent vlan removal * if multi function mode is afex, then mask default vlan */ if (IS_MF_AFEX(sc)) { rxq_init->silent_removal_value = sc->devinfo.mf_info.afex_def_vlan_tag; rxq_init->silent_removal_mask = EVL_VLID_MASK; } } static void bxe_pf_tx_q_prep(struct bxe_softc *sc, struct bxe_fastpath *fp, struct ecore_txq_setup_params *txq_init, uint8_t cos) { /* * XXX If multiple CoS is ever supported then each fastpath structure * will need to maintain tx producer/consumer/dma/etc values *per* CoS. * fp->txdata[cos]->tx_dma.paddr; */ txq_init->dscr_map = fp->tx_dma.paddr; txq_init->sb_cq_index = HC_INDEX_ETH_FIRST_TX_CQ_CONS + cos; txq_init->traffic_type = LLFC_TRAFFIC_TYPE_NW; txq_init->fw_sb_id = fp->fw_sb_id; /* * set the TSS leading client id for TX classfication to the * leading RSS client id */ txq_init->tss_leading_cl_id = BXE_FP(sc, 0, cl_id); } /* * This function performs 2 steps in a queue state machine: * 1) RESET->INIT * 2) INIT->SETUP */ static int bxe_setup_queue(struct bxe_softc *sc, struct bxe_fastpath *fp, uint8_t leading) { struct ecore_queue_state_params q_params = { NULL }; struct ecore_queue_setup_params *setup_params = &q_params.params.setup; #if 0 struct ecore_queue_setup_tx_only_params *tx_only_params = &q_params.params.tx_only; uint8_t tx_index; #endif int rc; BLOGD(sc, DBG_LOAD, "setting up queue %d\n", fp->index); bxe_ack_sb(sc, fp->igu_sb_id, USTORM_ID, 0, IGU_INT_ENABLE, 0); q_params.q_obj = &BXE_SP_OBJ(sc, fp).q_obj; /* we want to wait for completion in this context */ bxe_set_bit(RAMROD_COMP_WAIT, &q_params.ramrod_flags); /* prepare the INIT parameters */ bxe_pf_q_prep_init(sc, fp, &q_params.params.init); /* Set the command */ q_params.cmd = ECORE_Q_CMD_INIT; /* Change the state to INIT */ rc = ecore_queue_state_change(sc, &q_params); if (rc) { BLOGE(sc, "Queue(%d) INIT failed\n", fp->index); return (rc); } BLOGD(sc, DBG_LOAD, "init complete\n"); /* now move the Queue to the SETUP state */ memset(setup_params, 0, sizeof(*setup_params)); /* set Queue flags */ setup_params->flags = bxe_get_q_flags(sc, fp, leading); /* set general SETUP parameters */ bxe_pf_q_prep_general(sc, fp, &setup_params->gen_params, FIRST_TX_COS_INDEX); bxe_pf_rx_q_prep(sc, fp, &setup_params->pause_params, &setup_params->rxq_params); bxe_pf_tx_q_prep(sc, fp, &setup_params->txq_params, FIRST_TX_COS_INDEX); /* Set the command */ q_params.cmd = ECORE_Q_CMD_SETUP; /* change the state to SETUP */ rc = ecore_queue_state_change(sc, &q_params); if (rc) { BLOGE(sc, "Queue(%d) SETUP failed\n", fp->index); return (rc); } #if 0 /* loop through the relevant tx-only indices */ for (tx_index = FIRST_TX_ONLY_COS_INDEX; tx_index < sc->max_cos; tx_index++) { /* prepare and send tx-only ramrod*/ rc = bxe_setup_tx_only(sc, fp, &q_params, tx_only_params, tx_index, leading); if (rc) { BLOGE(sc, "Queue(%d.%d) TX_ONLY_SETUP failed\n", fp->index, tx_index); return (rc); } } #endif return (rc); } static int bxe_setup_leading(struct bxe_softc *sc) { return (bxe_setup_queue(sc, &sc->fp[0], TRUE)); } static int bxe_config_rss_pf(struct bxe_softc *sc, struct ecore_rss_config_obj *rss_obj, uint8_t config_hash) { struct ecore_config_rss_params params = { NULL }; int i; /* * Although RSS is meaningless when there is a single HW queue we * still need it enabled in order to have HW Rx hash generated. */ params.rss_obj = rss_obj; bxe_set_bit(RAMROD_COMP_WAIT, ¶ms.ramrod_flags); bxe_set_bit(ECORE_RSS_MODE_REGULAR, ¶ms.rss_flags); /* RSS configuration */ bxe_set_bit(ECORE_RSS_IPV4, ¶ms.rss_flags); bxe_set_bit(ECORE_RSS_IPV4_TCP, ¶ms.rss_flags); bxe_set_bit(ECORE_RSS_IPV6, ¶ms.rss_flags); bxe_set_bit(ECORE_RSS_IPV6_TCP, ¶ms.rss_flags); if (rss_obj->udp_rss_v4) { bxe_set_bit(ECORE_RSS_IPV4_UDP, ¶ms.rss_flags); } if (rss_obj->udp_rss_v6) { bxe_set_bit(ECORE_RSS_IPV6_UDP, ¶ms.rss_flags); } /* Hash bits */ params.rss_result_mask = MULTI_MASK; memcpy(params.ind_table, rss_obj->ind_table, sizeof(params.ind_table)); if (config_hash) { /* RSS keys */ for (i = 0; i < sizeof(params.rss_key) / 4; i++) { params.rss_key[i] = arc4random(); } bxe_set_bit(ECORE_RSS_SET_SRCH, ¶ms.rss_flags); } return (ecore_config_rss(sc, ¶ms)); } static int bxe_config_rss_eth(struct bxe_softc *sc, uint8_t config_hash) { return (bxe_config_rss_pf(sc, &sc->rss_conf_obj, config_hash)); } static int bxe_init_rss_pf(struct bxe_softc *sc) { uint8_t num_eth_queues = BXE_NUM_ETH_QUEUES(sc); int i; /* * Prepare the initial contents of the indirection table if * RSS is enabled */ for (i = 0; i < sizeof(sc->rss_conf_obj.ind_table); i++) { sc->rss_conf_obj.ind_table[i] = (sc->fp->cl_id + (i % num_eth_queues)); } if (sc->udp_rss) { sc->rss_conf_obj.udp_rss_v4 = sc->rss_conf_obj.udp_rss_v6 = 1; } /* * For 57710 and 57711 SEARCHER configuration (rss_keys) is * per-port, so if explicit configuration is needed, do it only * for a PMF. * * For 57712 and newer it's a per-function configuration. */ return (bxe_config_rss_eth(sc, sc->port.pmf || !CHIP_IS_E1x(sc))); } static int bxe_set_mac_one(struct bxe_softc *sc, uint8_t *mac, struct ecore_vlan_mac_obj *obj, uint8_t set, int mac_type, unsigned long *ramrod_flags) { struct ecore_vlan_mac_ramrod_params ramrod_param; int rc; memset(&ramrod_param, 0, sizeof(ramrod_param)); /* fill in general parameters */ ramrod_param.vlan_mac_obj = obj; ramrod_param.ramrod_flags = *ramrod_flags; /* fill a user request section if needed */ if (!bxe_test_bit(RAMROD_CONT, ramrod_flags)) { memcpy(ramrod_param.user_req.u.mac.mac, mac, ETH_ALEN); bxe_set_bit(mac_type, &ramrod_param.user_req.vlan_mac_flags); /* Set the command: ADD or DEL */ ramrod_param.user_req.cmd = (set) ? ECORE_VLAN_MAC_ADD : ECORE_VLAN_MAC_DEL; } rc = ecore_config_vlan_mac(sc, &ramrod_param); if (rc == ECORE_EXISTS) { BLOGD(sc, DBG_SP, "Failed to schedule ADD operations (EEXIST)\n"); /* do not treat adding same MAC as error */ rc = 0; } else if (rc < 0) { BLOGE(sc, "%s MAC failed (%d)\n", (set ? "Set" : "Delete"), rc); } return (rc); } static int bxe_set_eth_mac(struct bxe_softc *sc, uint8_t set) { unsigned long ramrod_flags = 0; BLOGD(sc, DBG_LOAD, "Adding Ethernet MAC\n"); bxe_set_bit(RAMROD_COMP_WAIT, &ramrod_flags); /* Eth MAC is set on RSS leading client (fp[0]) */ return (bxe_set_mac_one(sc, sc->link_params.mac_addr, &sc->sp_objs->mac_obj, set, ECORE_ETH_MAC, &ramrod_flags)); } #if 0 static void bxe_update_max_mf_config(struct bxe_softc *sc, uint32_t value) { /* load old values */ uint32_t mf_cfg = sc->devinfo.mf_info.mf_config[SC_VN(sc)]; if (value != bxe_extract_max_cfg(sc, mf_cfg)) { /* leave all but MAX value */ mf_cfg &= ~FUNC_MF_CFG_MAX_BW_MASK; /* set new MAX value */ mf_cfg |= ((value << FUNC_MF_CFG_MAX_BW_SHIFT) & FUNC_MF_CFG_MAX_BW_MASK); bxe_fw_command(sc, DRV_MSG_CODE_SET_MF_BW, mf_cfg); } } #endif static int bxe_get_cur_phy_idx(struct bxe_softc *sc) { uint32_t sel_phy_idx = 0; if (sc->link_params.num_phys <= 1) { return (ELINK_INT_PHY); } if (sc->link_vars.link_up) { sel_phy_idx = ELINK_EXT_PHY1; /* In case link is SERDES, check if the ELINK_EXT_PHY2 is the one */ if ((sc->link_vars.link_status & LINK_STATUS_SERDES_LINK) && (sc->link_params.phy[ELINK_EXT_PHY2].supported & ELINK_SUPPORTED_FIBRE)) sel_phy_idx = ELINK_EXT_PHY2; } else { switch (elink_phy_selection(&sc->link_params)) { case PORT_HW_CFG_PHY_SELECTION_HARDWARE_DEFAULT: case PORT_HW_CFG_PHY_SELECTION_FIRST_PHY: case PORT_HW_CFG_PHY_SELECTION_FIRST_PHY_PRIORITY: sel_phy_idx = ELINK_EXT_PHY1; break; case PORT_HW_CFG_PHY_SELECTION_SECOND_PHY: case PORT_HW_CFG_PHY_SELECTION_SECOND_PHY_PRIORITY: sel_phy_idx = ELINK_EXT_PHY2; break; } } return (sel_phy_idx); } static int bxe_get_link_cfg_idx(struct bxe_softc *sc) { uint32_t sel_phy_idx = bxe_get_cur_phy_idx(sc); /* * The selected activated PHY is always after swapping (in case PHY * swapping is enabled). So when swapping is enabled, we need to reverse * the configuration */ if (sc->link_params.multi_phy_config & PORT_HW_CFG_PHY_SWAPPED_ENABLED) { if (sel_phy_idx == ELINK_EXT_PHY1) sel_phy_idx = ELINK_EXT_PHY2; else if (sel_phy_idx == ELINK_EXT_PHY2) sel_phy_idx = ELINK_EXT_PHY1; } return (ELINK_LINK_CONFIG_IDX(sel_phy_idx)); } static void bxe_set_requested_fc(struct bxe_softc *sc) { /* * Initialize link parameters structure variables * It is recommended to turn off RX FC for jumbo frames * for better performance */ if (CHIP_IS_E1x(sc) && (sc->mtu > 5000)) { sc->link_params.req_fc_auto_adv = ELINK_FLOW_CTRL_TX; } else { sc->link_params.req_fc_auto_adv = ELINK_FLOW_CTRL_BOTH; } } static void bxe_calc_fc_adv(struct bxe_softc *sc) { uint8_t cfg_idx = bxe_get_link_cfg_idx(sc); switch (sc->link_vars.ieee_fc & MDIO_COMBO_IEEE0_AUTO_NEG_ADV_PAUSE_MASK) { case MDIO_COMBO_IEEE0_AUTO_NEG_ADV_PAUSE_NONE: default: sc->port.advertising[cfg_idx] &= ~(ADVERTISED_Asym_Pause | ADVERTISED_Pause); break; case MDIO_COMBO_IEEE0_AUTO_NEG_ADV_PAUSE_BOTH: sc->port.advertising[cfg_idx] |= (ADVERTISED_Asym_Pause | ADVERTISED_Pause); break; case MDIO_COMBO_IEEE0_AUTO_NEG_ADV_PAUSE_ASYMMETRIC: sc->port.advertising[cfg_idx] |= ADVERTISED_Asym_Pause; break; } } static uint16_t bxe_get_mf_speed(struct bxe_softc *sc) { uint16_t line_speed = sc->link_vars.line_speed; if (IS_MF(sc)) { uint16_t maxCfg = bxe_extract_max_cfg(sc, sc->devinfo.mf_info.mf_config[SC_VN(sc)]); /* calculate the current MAX line speed limit for the MF devices */ if (IS_MF_SI(sc)) { line_speed = (line_speed * maxCfg) / 100; } else { /* SD mode */ uint16_t vn_max_rate = maxCfg * 100; if (vn_max_rate < line_speed) { line_speed = vn_max_rate; } } } return (line_speed); } static void bxe_fill_report_data(struct bxe_softc *sc, struct bxe_link_report_data *data) { uint16_t line_speed = bxe_get_mf_speed(sc); memset(data, 0, sizeof(*data)); /* fill the report data with the effective line speed */ data->line_speed = line_speed; /* Link is down */ if (!sc->link_vars.link_up || (sc->flags & BXE_MF_FUNC_DIS)) { bxe_set_bit(BXE_LINK_REPORT_LINK_DOWN, &data->link_report_flags); } /* Full DUPLEX */ if (sc->link_vars.duplex == DUPLEX_FULL) { bxe_set_bit(BXE_LINK_REPORT_FULL_DUPLEX, &data->link_report_flags); } /* Rx Flow Control is ON */ if (sc->link_vars.flow_ctrl & ELINK_FLOW_CTRL_RX) { bxe_set_bit(BXE_LINK_REPORT_RX_FC_ON, &data->link_report_flags); } /* Tx Flow Control is ON */ if (sc->link_vars.flow_ctrl & ELINK_FLOW_CTRL_TX) { bxe_set_bit(BXE_LINK_REPORT_TX_FC_ON, &data->link_report_flags); } } /* report link status to OS, should be called under phy_lock */ static void bxe_link_report_locked(struct bxe_softc *sc) { struct bxe_link_report_data cur_data; /* reread mf_cfg */ if (IS_PF(sc) && !CHIP_IS_E1(sc)) { bxe_read_mf_cfg(sc); } /* Read the current link report info */ bxe_fill_report_data(sc, &cur_data); /* Don't report link down or exactly the same link status twice */ if (!memcmp(&cur_data, &sc->last_reported_link, sizeof(cur_data)) || (bxe_test_bit(BXE_LINK_REPORT_LINK_DOWN, &sc->last_reported_link.link_report_flags) && bxe_test_bit(BXE_LINK_REPORT_LINK_DOWN, &cur_data.link_report_flags))) { return; } sc->link_cnt++; /* report new link params and remember the state for the next time */ memcpy(&sc->last_reported_link, &cur_data, sizeof(cur_data)); if (bxe_test_bit(BXE_LINK_REPORT_LINK_DOWN, &cur_data.link_report_flags)) { if_link_state_change(sc->ifp, LINK_STATE_DOWN); BLOGI(sc, "NIC Link is Down\n"); } else { const char *duplex; const char *flow; if (bxe_test_and_clear_bit(BXE_LINK_REPORT_FULL_DUPLEX, &cur_data.link_report_flags)) { duplex = "full"; } else { duplex = "half"; } /* * Handle the FC at the end so that only these flags would be * possibly set. This way we may easily check if there is no FC * enabled. */ if (cur_data.link_report_flags) { if (bxe_test_bit(BXE_LINK_REPORT_RX_FC_ON, &cur_data.link_report_flags) && bxe_test_bit(BXE_LINK_REPORT_TX_FC_ON, &cur_data.link_report_flags)) { flow = "ON - receive & transmit"; } else if (bxe_test_bit(BXE_LINK_REPORT_RX_FC_ON, &cur_data.link_report_flags) && !bxe_test_bit(BXE_LINK_REPORT_TX_FC_ON, &cur_data.link_report_flags)) { flow = "ON - receive"; } else if (!bxe_test_bit(BXE_LINK_REPORT_RX_FC_ON, &cur_data.link_report_flags) && bxe_test_bit(BXE_LINK_REPORT_TX_FC_ON, &cur_data.link_report_flags)) { flow = "ON - transmit"; } else { flow = "none"; /* possible? */ } } else { flow = "none"; } if_link_state_change(sc->ifp, LINK_STATE_UP); BLOGI(sc, "NIC Link is Up, %d Mbps %s duplex, Flow control: %s\n", cur_data.line_speed, duplex, flow); } } static void bxe_link_report(struct bxe_softc *sc) { BXE_PHY_LOCK(sc); bxe_link_report_locked(sc); BXE_PHY_UNLOCK(sc); } static void bxe_link_status_update(struct bxe_softc *sc) { if (sc->state != BXE_STATE_OPEN) { return; } #if 0 /* read updated dcb configuration */ if (IS_PF(sc)) bxe_dcbx_pmf_update(sc); #endif if (IS_PF(sc) && !CHIP_REV_IS_SLOW(sc)) { elink_link_status_update(&sc->link_params, &sc->link_vars); } else { sc->port.supported[0] |= (ELINK_SUPPORTED_10baseT_Half | ELINK_SUPPORTED_10baseT_Full | ELINK_SUPPORTED_100baseT_Half | ELINK_SUPPORTED_100baseT_Full | ELINK_SUPPORTED_1000baseT_Full | ELINK_SUPPORTED_2500baseX_Full | ELINK_SUPPORTED_10000baseT_Full | ELINK_SUPPORTED_TP | ELINK_SUPPORTED_FIBRE | ELINK_SUPPORTED_Autoneg | ELINK_SUPPORTED_Pause | ELINK_SUPPORTED_Asym_Pause); sc->port.advertising[0] = sc->port.supported[0]; sc->link_params.sc = sc; sc->link_params.port = SC_PORT(sc); sc->link_params.req_duplex[0] = DUPLEX_FULL; sc->link_params.req_flow_ctrl[0] = ELINK_FLOW_CTRL_NONE; sc->link_params.req_line_speed[0] = SPEED_10000; sc->link_params.speed_cap_mask[0] = 0x7f0000; sc->link_params.switch_cfg = ELINK_SWITCH_CFG_10G; if (CHIP_REV_IS_FPGA(sc)) { sc->link_vars.mac_type = ELINK_MAC_TYPE_EMAC; sc->link_vars.line_speed = ELINK_SPEED_1000; sc->link_vars.link_status = (LINK_STATUS_LINK_UP | LINK_STATUS_SPEED_AND_DUPLEX_1000TFD); } else { sc->link_vars.mac_type = ELINK_MAC_TYPE_BMAC; sc->link_vars.line_speed = ELINK_SPEED_10000; sc->link_vars.link_status = (LINK_STATUS_LINK_UP | LINK_STATUS_SPEED_AND_DUPLEX_10GTFD); } sc->link_vars.link_up = 1; sc->link_vars.duplex = DUPLEX_FULL; sc->link_vars.flow_ctrl = ELINK_FLOW_CTRL_NONE; if (IS_PF(sc)) { REG_WR(sc, NIG_REG_EGRESS_DRAIN0_MODE + sc->link_params.port*4, 0); bxe_stats_handle(sc, STATS_EVENT_LINK_UP); bxe_link_report(sc); } } if (IS_PF(sc)) { if (sc->link_vars.link_up) { bxe_stats_handle(sc, STATS_EVENT_LINK_UP); } else { bxe_stats_handle(sc, STATS_EVENT_STOP); } bxe_link_report(sc); } else { bxe_link_report(sc); bxe_stats_handle(sc, STATS_EVENT_LINK_UP); } } static int bxe_initial_phy_init(struct bxe_softc *sc, int load_mode) { int rc, cfg_idx = bxe_get_link_cfg_idx(sc); uint16_t req_line_speed = sc->link_params.req_line_speed[cfg_idx]; struct elink_params *lp = &sc->link_params; bxe_set_requested_fc(sc); if (CHIP_REV_IS_SLOW(sc)) { uint32_t bond = CHIP_BOND_ID(sc); uint32_t feat = 0; if (CHIP_IS_E2(sc) && CHIP_IS_MODE_4_PORT(sc)) { feat |= ELINK_FEATURE_CONFIG_EMUL_DISABLE_BMAC; } else if (bond & 0x4) { if (CHIP_IS_E3(sc)) { feat |= ELINK_FEATURE_CONFIG_EMUL_DISABLE_XMAC; } else { feat |= ELINK_FEATURE_CONFIG_EMUL_DISABLE_BMAC; } } else if (bond & 0x8) { if (CHIP_IS_E3(sc)) { feat |= ELINK_FEATURE_CONFIG_EMUL_DISABLE_UMAC; } else { feat |= ELINK_FEATURE_CONFIG_EMUL_DISABLE_EMAC; } } /* disable EMAC for E3 and above */ if (bond & 0x2) { feat |= ELINK_FEATURE_CONFIG_EMUL_DISABLE_EMAC; } sc->link_params.feature_config_flags |= feat; } BXE_PHY_LOCK(sc); if (load_mode == LOAD_DIAG) { lp->loopback_mode = ELINK_LOOPBACK_XGXS; /* Prefer doing PHY loopback at 10G speed, if possible */ if (lp->req_line_speed[cfg_idx] < ELINK_SPEED_10000) { if (lp->speed_cap_mask[cfg_idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_10G) { lp->req_line_speed[cfg_idx] = ELINK_SPEED_10000; } else { lp->req_line_speed[cfg_idx] = ELINK_SPEED_1000; } } } if (load_mode == LOAD_LOOPBACK_EXT) { lp->loopback_mode = ELINK_LOOPBACK_EXT; } rc = elink_phy_init(&sc->link_params, &sc->link_vars); BXE_PHY_UNLOCK(sc); bxe_calc_fc_adv(sc); if (sc->link_vars.link_up) { bxe_stats_handle(sc, STATS_EVENT_LINK_UP); bxe_link_report(sc); } if (!CHIP_REV_IS_SLOW(sc)) { bxe_periodic_start(sc); } sc->link_params.req_line_speed[cfg_idx] = req_line_speed; return (rc); } /* must be called under IF_ADDR_LOCK */ static int bxe_init_mcast_macs_list(struct bxe_softc *sc, struct ecore_mcast_ramrod_params *p) { if_t ifp = sc->ifp; int mc_count = 0; int mcnt, i; struct ecore_mcast_list_elem *mc_mac; unsigned char *mta; mc_count = if_multiaddr_count(ifp, -1);/* XXX they don't have a limit */ /* should we enforce one? */ ECORE_LIST_INIT(&p->mcast_list); p->mcast_list_len = 0; if (!mc_count) { return (0); } mta = malloc(sizeof(unsigned char) * ETHER_ADDR_LEN * mc_count, M_DEVBUF, M_NOWAIT); if(mta == NULL) { BLOGE(sc, "Failed to allocate temp mcast list\n"); return (-1); } mc_mac = malloc(sizeof(*mc_mac) * mc_count, M_DEVBUF, (M_NOWAIT | M_ZERO)); if (!mc_mac) { free(mta, M_DEVBUF); BLOGE(sc, "Failed to allocate temp mcast list\n"); return (-1); } if_multiaddr_array(ifp, mta, &mcnt, mc_count); /* mta and mcnt not expected to be different */ for(i=0; i< mcnt; i++) { bcopy((mta + (i * ETHER_ADDR_LEN)), mc_mac->mac, ETHER_ADDR_LEN); ECORE_LIST_PUSH_TAIL(&mc_mac->link, &p->mcast_list); BLOGD(sc, DBG_LOAD, "Setting MCAST %02X:%02X:%02X:%02X:%02X:%02X\n", mc_mac->mac[0], mc_mac->mac[1], mc_mac->mac[2], mc_mac->mac[3], mc_mac->mac[4], mc_mac->mac[5]); mc_mac++; } p->mcast_list_len = mc_count; free(mta, M_DEVBUF); return (0); } static void bxe_free_mcast_macs_list(struct ecore_mcast_ramrod_params *p) { struct ecore_mcast_list_elem *mc_mac = ECORE_LIST_FIRST_ENTRY(&p->mcast_list, struct ecore_mcast_list_elem, link); if (mc_mac) { /* only a single free as all mc_macs are in the same heap array */ free(mc_mac, M_DEVBUF); } } static int bxe_set_mc_list(struct bxe_softc *sc) { struct ecore_mcast_ramrod_params rparam = { NULL }; int rc = 0; rparam.mcast_obj = &sc->mcast_obj; BXE_MCAST_LOCK(sc); /* first, clear all configured multicast MACs */ rc = ecore_config_mcast(sc, &rparam, ECORE_MCAST_CMD_DEL); if (rc < 0) { BLOGE(sc, "Failed to clear multicast configuration: %d\n", rc); return (rc); } /* configure a new MACs list */ rc = bxe_init_mcast_macs_list(sc, &rparam); if (rc) { BLOGE(sc, "Failed to create mcast MACs list (%d)\n", rc); BXE_MCAST_UNLOCK(sc); return (rc); } /* Now add the new MACs */ rc = ecore_config_mcast(sc, &rparam, ECORE_MCAST_CMD_ADD); if (rc < 0) { BLOGE(sc, "Failed to set new mcast config (%d)\n", rc); } bxe_free_mcast_macs_list(&rparam); BXE_MCAST_UNLOCK(sc); return (rc); } static int bxe_set_uc_list(struct bxe_softc *sc) { if_t ifp = sc->ifp; struct ecore_vlan_mac_obj *mac_obj = &sc->sp_objs->mac_obj; struct ifaddr *ifa; unsigned long ramrod_flags = 0; int rc; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_addr_rlock(ifp); #endif /* first schedule a cleanup up of old configuration */ rc = bxe_del_all_macs(sc, mac_obj, ECORE_UC_LIST_MAC, FALSE); if (rc < 0) { BLOGE(sc, "Failed to schedule delete of all ETH MACs (%d)\n", rc); #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_addr_runlock(ifp); #endif return (rc); } ifa = if_getifaddr(ifp); /* XXX Is this structure */ while (ifa) { if (ifa->ifa_addr->sa_family != AF_LINK) { ifa = TAILQ_NEXT(ifa, ifa_link); continue; } rc = bxe_set_mac_one(sc, (uint8_t *)LLADDR((struct sockaddr_dl *)ifa->ifa_addr), mac_obj, TRUE, ECORE_UC_LIST_MAC, &ramrod_flags); if (rc == -EEXIST) { BLOGD(sc, DBG_SP, "Failed to schedule ADD operations (EEXIST)\n"); /* do not treat adding same MAC as an error */ rc = 0; } else if (rc < 0) { BLOGE(sc, "Failed to schedule ADD operations (%d)\n", rc); #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_addr_runlock(ifp); #endif return (rc); } ifa = TAILQ_NEXT(ifa, ifa_link); } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_addr_runlock(ifp); #endif /* Execute the pending commands */ bit_set(&ramrod_flags, RAMROD_CONT); return (bxe_set_mac_one(sc, NULL, mac_obj, FALSE /* don't care */, ECORE_UC_LIST_MAC, &ramrod_flags)); } static void bxe_handle_rx_mode_tq(void *context, int pending) { struct bxe_softc *sc = (struct bxe_softc *)context; if_t ifp = sc->ifp; uint32_t rx_mode = BXE_RX_MODE_NORMAL; BXE_CORE_LOCK(sc); if (sc->state != BXE_STATE_OPEN) { BLOGD(sc, DBG_SP, "state is %x, returning\n", sc->state); BXE_CORE_UNLOCK(sc); return; } BLOGD(sc, DBG_SP, "if_flags(ifp)=0x%x\n", if_getflags(sc->ifp)); if (if_getflags(ifp) & IFF_PROMISC) { rx_mode = BXE_RX_MODE_PROMISC; } else if ((if_getflags(ifp) & IFF_ALLMULTI) || ((if_getamcount(ifp) > BXE_MAX_MULTICAST) && CHIP_IS_E1(sc))) { rx_mode = BXE_RX_MODE_ALLMULTI; } else { if (IS_PF(sc)) { /* some multicasts */ if (bxe_set_mc_list(sc) < 0) { rx_mode = BXE_RX_MODE_ALLMULTI; } if (bxe_set_uc_list(sc) < 0) { rx_mode = BXE_RX_MODE_PROMISC; } } #if 0 else { /* * Configuring mcast to a VF involves sleeping (when we * wait for the PF's response). Since this function is * called from a non sleepable context we must schedule * a work item for this purpose */ bxe_set_bit(BXE_SP_RTNL_VFPF_MCAST, &sc->sp_rtnl_state); schedule_delayed_work(&sc->sp_rtnl_task, 0); } #endif } sc->rx_mode = rx_mode; /* schedule the rx_mode command */ if (bxe_test_bit(ECORE_FILTER_RX_MODE_PENDING, &sc->sp_state)) { BLOGD(sc, DBG_LOAD, "Scheduled setting rx_mode with ECORE...\n"); bxe_set_bit(ECORE_FILTER_RX_MODE_SCHED, &sc->sp_state); BXE_CORE_UNLOCK(sc); return; } if (IS_PF(sc)) { bxe_set_storm_rx_mode(sc); } #if 0 else { /* * Configuring mcast to a VF involves sleeping (when we * wait for the PF's response). Since this function is * called from a non sleepable context we must schedule * a work item for this purpose */ bxe_set_bit(BXE_SP_RTNL_VFPF_STORM_RX_MODE, &sc->sp_rtnl_state); schedule_delayed_work(&sc->sp_rtnl_task, 0); } #endif BXE_CORE_UNLOCK(sc); } static void bxe_set_rx_mode(struct bxe_softc *sc) { taskqueue_enqueue(sc->rx_mode_tq, &sc->rx_mode_tq_task); } /* update flags in shmem */ static void bxe_update_drv_flags(struct bxe_softc *sc, uint32_t flags, uint32_t set) { uint32_t drv_flags; if (SHMEM2_HAS(sc, drv_flags)) { bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_DRV_FLAGS); drv_flags = SHMEM2_RD(sc, drv_flags); if (set) { SET_FLAGS(drv_flags, flags); } else { RESET_FLAGS(drv_flags, flags); } SHMEM2_WR(sc, drv_flags, drv_flags); BLOGD(sc, DBG_LOAD, "drv_flags 0x%08x\n", drv_flags); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_DRV_FLAGS); } } /* periodic timer callout routine, only runs when the interface is up */ static void bxe_periodic_callout_func(void *xsc) { struct bxe_softc *sc = (struct bxe_softc *)xsc; int i; if (!BXE_CORE_TRYLOCK(sc)) { /* just bail and try again next time */ if ((sc->state == BXE_STATE_OPEN) && (atomic_load_acq_long(&sc->periodic_flags) == PERIODIC_GO)) { /* schedule the next periodic callout */ callout_reset(&sc->periodic_callout, hz, bxe_periodic_callout_func, sc); } return; } if ((sc->state != BXE_STATE_OPEN) || (atomic_load_acq_long(&sc->periodic_flags) == PERIODIC_STOP)) { BLOGW(sc, "periodic callout exit (state=0x%x)\n", sc->state); BXE_CORE_UNLOCK(sc); return; } /* Check for TX timeouts on any fastpath. */ FOR_EACH_QUEUE(sc, i) { if (bxe_watchdog(sc, &sc->fp[i]) != 0) { /* Ruh-Roh, chip was reset! */ break; } } if (!CHIP_REV_IS_SLOW(sc)) { /* * This barrier is needed to ensure the ordering between the writing * to the sc->port.pmf in the bxe_nic_load() or bxe_pmf_update() and * the reading here. */ mb(); if (sc->port.pmf) { BXE_PHY_LOCK(sc); elink_period_func(&sc->link_params, &sc->link_vars); BXE_PHY_UNLOCK(sc); } } if (IS_PF(sc) && !BXE_NOMCP(sc)) { int mb_idx = SC_FW_MB_IDX(sc); uint32_t drv_pulse; uint32_t mcp_pulse; ++sc->fw_drv_pulse_wr_seq; sc->fw_drv_pulse_wr_seq &= DRV_PULSE_SEQ_MASK; drv_pulse = sc->fw_drv_pulse_wr_seq; bxe_drv_pulse(sc); mcp_pulse = (SHMEM_RD(sc, func_mb[mb_idx].mcp_pulse_mb) & MCP_PULSE_SEQ_MASK); /* * The delta between driver pulse and mcp response should * be 1 (before mcp response) or 0 (after mcp response). */ if ((drv_pulse != mcp_pulse) && (drv_pulse != ((mcp_pulse + 1) & MCP_PULSE_SEQ_MASK))) { /* someone lost a heartbeat... */ BLOGE(sc, "drv_pulse (0x%x) != mcp_pulse (0x%x)\n", drv_pulse, mcp_pulse); } } /* state is BXE_STATE_OPEN */ bxe_stats_handle(sc, STATS_EVENT_UPDATE); #if 0 /* sample VF bulletin board for new posts from PF */ if (IS_VF(sc)) { bxe_sample_bulletin(sc); } #endif BXE_CORE_UNLOCK(sc); if ((sc->state == BXE_STATE_OPEN) && (atomic_load_acq_long(&sc->periodic_flags) == PERIODIC_GO)) { /* schedule the next periodic callout */ callout_reset(&sc->periodic_callout, hz, bxe_periodic_callout_func, sc); } } static void bxe_periodic_start(struct bxe_softc *sc) { atomic_store_rel_long(&sc->periodic_flags, PERIODIC_GO); callout_reset(&sc->periodic_callout, hz, bxe_periodic_callout_func, sc); } static void bxe_periodic_stop(struct bxe_softc *sc) { atomic_store_rel_long(&sc->periodic_flags, PERIODIC_STOP); callout_drain(&sc->periodic_callout); } /* start the controller */ static __noinline int bxe_nic_load(struct bxe_softc *sc, int load_mode) { uint32_t val; int load_code = 0; int i, rc = 0; BXE_CORE_LOCK_ASSERT(sc); BLOGD(sc, DBG_LOAD, "Starting NIC load...\n"); sc->state = BXE_STATE_OPENING_WAITING_LOAD; if (IS_PF(sc)) { /* must be called before memory allocation and HW init */ bxe_ilt_set_info(sc); } sc->last_reported_link_state = LINK_STATE_UNKNOWN; bxe_set_fp_rx_buf_size(sc); if (bxe_alloc_fp_buffers(sc) != 0) { BLOGE(sc, "Failed to allocate fastpath memory\n"); sc->state = BXE_STATE_CLOSED; rc = ENOMEM; goto bxe_nic_load_error0; } if (bxe_alloc_mem(sc) != 0) { sc->state = BXE_STATE_CLOSED; rc = ENOMEM; goto bxe_nic_load_error0; } if (bxe_alloc_fw_stats_mem(sc) != 0) { sc->state = BXE_STATE_CLOSED; rc = ENOMEM; goto bxe_nic_load_error0; } if (IS_PF(sc)) { /* set pf load just before approaching the MCP */ bxe_set_pf_load(sc); /* if MCP exists send load request and analyze response */ if (!BXE_NOMCP(sc)) { /* attempt to load pf */ if (bxe_nic_load_request(sc, &load_code) != 0) { sc->state = BXE_STATE_CLOSED; rc = ENXIO; goto bxe_nic_load_error1; } /* what did the MCP say? */ if (bxe_nic_load_analyze_req(sc, load_code) != 0) { bxe_fw_command(sc, DRV_MSG_CODE_LOAD_DONE, 0); sc->state = BXE_STATE_CLOSED; rc = ENXIO; goto bxe_nic_load_error2; } } else { BLOGI(sc, "Device has no MCP!\n"); load_code = bxe_nic_load_no_mcp(sc); } /* mark PMF if applicable */ bxe_nic_load_pmf(sc, load_code); /* Init Function state controlling object */ bxe_init_func_obj(sc); /* Initialize HW */ if (bxe_init_hw(sc, load_code) != 0) { BLOGE(sc, "HW init failed\n"); bxe_fw_command(sc, DRV_MSG_CODE_LOAD_DONE, 0); sc->state = BXE_STATE_CLOSED; rc = ENXIO; goto bxe_nic_load_error2; } } /* attach interrupts */ if (bxe_interrupt_attach(sc) != 0) { sc->state = BXE_STATE_CLOSED; rc = ENXIO; goto bxe_nic_load_error2; } bxe_nic_init(sc, load_code); /* Init per-function objects */ if (IS_PF(sc)) { bxe_init_objs(sc); // XXX bxe_iov_nic_init(sc); /* set AFEX default VLAN tag to an invalid value */ sc->devinfo.mf_info.afex_def_vlan_tag = -1; // XXX bxe_nic_load_afex_dcc(sc, load_code); sc->state = BXE_STATE_OPENING_WAITING_PORT; rc = bxe_func_start(sc); if (rc) { BLOGE(sc, "Function start failed!\n"); bxe_fw_command(sc, DRV_MSG_CODE_LOAD_DONE, 0); sc->state = BXE_STATE_ERROR; goto bxe_nic_load_error3; } /* send LOAD_DONE command to MCP */ if (!BXE_NOMCP(sc)) { load_code = bxe_fw_command(sc, DRV_MSG_CODE_LOAD_DONE, 0); if (!load_code) { BLOGE(sc, "MCP response failure, aborting\n"); sc->state = BXE_STATE_ERROR; rc = ENXIO; goto bxe_nic_load_error3; } } rc = bxe_setup_leading(sc); if (rc) { BLOGE(sc, "Setup leading failed!\n"); sc->state = BXE_STATE_ERROR; goto bxe_nic_load_error3; } FOR_EACH_NONDEFAULT_ETH_QUEUE(sc, i) { rc = bxe_setup_queue(sc, &sc->fp[i], FALSE); if (rc) { BLOGE(sc, "Queue(%d) setup failed\n", i); sc->state = BXE_STATE_ERROR; goto bxe_nic_load_error3; } } rc = bxe_init_rss_pf(sc); if (rc) { BLOGE(sc, "PF RSS init failed\n"); sc->state = BXE_STATE_ERROR; goto bxe_nic_load_error3; } } /* XXX VF */ #if 0 else { /* VF */ FOR_EACH_ETH_QUEUE(sc, i) { rc = bxe_vfpf_setup_q(sc, i); if (rc) { BLOGE(sc, "Queue(%d) setup failed\n", i); sc->state = BXE_STATE_ERROR; goto bxe_nic_load_error3; } } } #endif /* now when Clients are configured we are ready to work */ sc->state = BXE_STATE_OPEN; /* Configure a ucast MAC */ if (IS_PF(sc)) { rc = bxe_set_eth_mac(sc, TRUE); } #if 0 else { /* IS_VF(sc) */ rc = bxe_vfpf_set_mac(sc); } #endif if (rc) { BLOGE(sc, "Setting Ethernet MAC failed\n"); sc->state = BXE_STATE_ERROR; goto bxe_nic_load_error3; } #if 0 if (IS_PF(sc) && sc->pending_max) { /* for AFEX */ bxe_update_max_mf_config(sc, sc->pending_max); sc->pending_max = 0; } #endif if (sc->port.pmf) { rc = bxe_initial_phy_init(sc, /* XXX load_mode */LOAD_OPEN); if (rc) { sc->state = BXE_STATE_ERROR; goto bxe_nic_load_error3; } } sc->link_params.feature_config_flags &= ~ELINK_FEATURE_CONFIG_BOOT_FROM_SAN; /* start fast path */ /* Initialize Rx filter */ bxe_set_rx_mode(sc); /* start the Tx */ switch (/* XXX load_mode */LOAD_OPEN) { case LOAD_NORMAL: case LOAD_OPEN: break; case LOAD_DIAG: case LOAD_LOOPBACK_EXT: sc->state = BXE_STATE_DIAG; break; default: break; } if (sc->port.pmf) { bxe_update_drv_flags(sc, 1 << DRV_FLAGS_PORT_MASK, 0); } else { bxe_link_status_update(sc); } /* start the periodic timer callout */ bxe_periodic_start(sc); if (IS_PF(sc) && SHMEM2_HAS(sc, drv_capabilities_flag)) { /* mark driver is loaded in shmem2 */ val = SHMEM2_RD(sc, drv_capabilities_flag[SC_FW_MB_IDX(sc)]); SHMEM2_WR(sc, drv_capabilities_flag[SC_FW_MB_IDX(sc)], (val | DRV_FLAGS_CAPABILITIES_LOADED_SUPPORTED | DRV_FLAGS_CAPABILITIES_LOADED_L2)); } /* wait for all pending SP commands to complete */ if (IS_PF(sc) && !bxe_wait_sp_comp(sc, ~0x0UL)) { BLOGE(sc, "Timeout waiting for all SPs to complete!\n"); bxe_periodic_stop(sc); bxe_nic_unload(sc, UNLOAD_CLOSE, FALSE); return (ENXIO); } #if 0 /* If PMF - send ADMIN DCBX msg to MFW to initiate DCBX FSM */ if (sc->port.pmf && (sc->state != BXE_STATE_DIAG)) { bxe_dcbx_init(sc, FALSE); } #endif /* Tell the stack the driver is running! */ if_setdrvflags(sc->ifp, IFF_DRV_RUNNING); BLOGD(sc, DBG_LOAD, "NIC successfully loaded\n"); return (0); bxe_nic_load_error3: if (IS_PF(sc)) { bxe_int_disable_sync(sc, 1); /* clean out queued objects */ bxe_squeeze_objects(sc); } bxe_interrupt_detach(sc); bxe_nic_load_error2: if (IS_PF(sc) && !BXE_NOMCP(sc)) { bxe_fw_command(sc, DRV_MSG_CODE_UNLOAD_REQ_WOL_MCP, 0); bxe_fw_command(sc, DRV_MSG_CODE_UNLOAD_DONE, 0); } sc->port.pmf = 0; bxe_nic_load_error1: /* clear pf_load status, as it was already set */ if (IS_PF(sc)) { bxe_clear_pf_load(sc); } bxe_nic_load_error0: bxe_free_fw_stats_mem(sc); bxe_free_fp_buffers(sc); bxe_free_mem(sc); return (rc); } static int bxe_init_locked(struct bxe_softc *sc) { int other_engine = SC_PATH(sc) ? 0 : 1; uint8_t other_load_status, load_status; uint8_t global = FALSE; int rc; BXE_CORE_LOCK_ASSERT(sc); /* check if the driver is already running */ if (if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING) { BLOGD(sc, DBG_LOAD, "Init called while driver is running!\n"); return (0); } bxe_set_power_state(sc, PCI_PM_D0); /* * If parity occurred during the unload, then attentions and/or * RECOVERY_IN_PROGRES may still be set. If so we want the first function * loaded on the current engine to complete the recovery. Parity recovery * is only relevant for PF driver. */ if (IS_PF(sc)) { other_load_status = bxe_get_load_status(sc, other_engine); load_status = bxe_get_load_status(sc, SC_PATH(sc)); if (!bxe_reset_is_done(sc, SC_PATH(sc)) || bxe_chk_parity_attn(sc, &global, TRUE)) { do { /* * If there are attentions and they are in global blocks, set * the GLOBAL_RESET bit regardless whether it will be this * function that will complete the recovery or not. */ if (global) { bxe_set_reset_global(sc); } /* * Only the first function on the current engine should try * to recover in open. In case of attentions in global blocks * only the first in the chip should try to recover. */ if ((!load_status && (!global || !other_load_status)) && bxe_trylock_leader_lock(sc) && !bxe_leader_reset(sc)) { BLOGI(sc, "Recovered during init\n"); break; } /* recovery has failed... */ bxe_set_power_state(sc, PCI_PM_D3hot); sc->recovery_state = BXE_RECOVERY_FAILED; BLOGE(sc, "Recovery flow hasn't properly " "completed yet, try again later. " "If you still see this message after a " "few retries then power cycle is required.\n"); rc = ENXIO; goto bxe_init_locked_done; } while (0); } } sc->recovery_state = BXE_RECOVERY_DONE; rc = bxe_nic_load(sc, LOAD_OPEN); bxe_init_locked_done: if (rc) { /* Tell the stack the driver is NOT running! */ BLOGE(sc, "Initialization failed, " "stack notified driver is NOT running!\n"); if_setdrvflagbits(sc->ifp, 0, IFF_DRV_RUNNING); } return (rc); } static int bxe_stop_locked(struct bxe_softc *sc) { BXE_CORE_LOCK_ASSERT(sc); return (bxe_nic_unload(sc, UNLOAD_NORMAL, TRUE)); } /* * Handles controller initialization when called from an unlocked routine. * ifconfig calls this function. * * Returns: * void */ static void bxe_init(void *xsc) { struct bxe_softc *sc = (struct bxe_softc *)xsc; BXE_CORE_LOCK(sc); bxe_init_locked(sc); BXE_CORE_UNLOCK(sc); } static int bxe_init_ifnet(struct bxe_softc *sc) { if_t ifp; int capabilities; /* ifconfig entrypoint for media type/status reporting */ ifmedia_init(&sc->ifmedia, IFM_IMASK, bxe_ifmedia_update, bxe_ifmedia_status); /* set the default interface values */ ifmedia_add(&sc->ifmedia, (IFM_ETHER | IFM_FDX | sc->media), 0, NULL); ifmedia_add(&sc->ifmedia, (IFM_ETHER | IFM_AUTO), 0, NULL); ifmedia_set(&sc->ifmedia, (IFM_ETHER | IFM_AUTO)); sc->ifmedia.ifm_media = sc->ifmedia.ifm_cur->ifm_media; /* XXX ? */ /* allocate the ifnet structure */ if ((ifp = if_gethandle(IFT_ETHER)) == NULL) { BLOGE(sc, "Interface allocation failed!\n"); return (ENXIO); } if_setsoftc(ifp, sc); if_initname(ifp, device_get_name(sc->dev), device_get_unit(sc->dev)); if_setflags(ifp, (IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST)); if_setioctlfn(ifp, bxe_ioctl); if_setstartfn(ifp, bxe_tx_start); if_setgetcounterfn(ifp, bxe_get_counter); #if __FreeBSD_version >= 800000 if_settransmitfn(ifp, bxe_tx_mq_start); if_setqflushfn(ifp, bxe_mq_flush); #endif #ifdef FreeBSD8_0 if_settimer(ifp, 0); #endif if_setinitfn(ifp, bxe_init); if_setmtu(ifp, sc->mtu); if_sethwassist(ifp, (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_TSO | CSUM_TCP_IPV6 | CSUM_UDP_IPV6)); capabilities = #if __FreeBSD_version < 700000 (IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_HWCSUM | IFCAP_JUMBO_MTU | IFCAP_LRO); #else (IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWCSUM | IFCAP_HWCSUM | IFCAP_JUMBO_MTU | IFCAP_LRO | IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_WOL_MAGIC); #endif if_setcapabilitiesbit(ifp, capabilities, 0); /* XXX */ if_setbaudrate(ifp, IF_Gbps(10)); /* XXX */ if_setsendqlen(ifp, sc->tx_ring_size); if_setsendqready(ifp); /* XXX */ sc->ifp = ifp; /* attach to the Ethernet interface list */ ether_ifattach(ifp, sc->link_params.mac_addr); return (0); } static void bxe_deallocate_bars(struct bxe_softc *sc) { int i; for (i = 0; i < MAX_BARS; i++) { if (sc->bar[i].resource != NULL) { bus_release_resource(sc->dev, SYS_RES_MEMORY, sc->bar[i].rid, sc->bar[i].resource); BLOGD(sc, DBG_LOAD, "Released PCI BAR%d [%02x] memory\n", i, PCIR_BAR(i)); } } } static int bxe_allocate_bars(struct bxe_softc *sc) { u_int flags; int i; memset(sc->bar, 0, sizeof(sc->bar)); for (i = 0; i < MAX_BARS; i++) { /* memory resources reside at BARs 0, 2, 4 */ /* Run `pciconf -lb` to see mappings */ if ((i != 0) && (i != 2) && (i != 4)) { continue; } sc->bar[i].rid = PCIR_BAR(i); flags = RF_ACTIVE; if (i == 0) { flags |= RF_SHAREABLE; } if ((sc->bar[i].resource = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->bar[i].rid, flags)) == NULL) { #if 0 /* BAR4 doesn't exist for E1 */ BLOGE(sc, "PCI BAR%d [%02x] memory allocation failed\n", i, PCIR_BAR(i)); #endif return (0); } sc->bar[i].tag = rman_get_bustag(sc->bar[i].resource); sc->bar[i].handle = rman_get_bushandle(sc->bar[i].resource); sc->bar[i].kva = (vm_offset_t)rman_get_virtual(sc->bar[i].resource); BLOGI(sc, "PCI BAR%d [%02x] memory allocated: %p-%p (%ld) -> %p\n", i, PCIR_BAR(i), (void *)rman_get_start(sc->bar[i].resource), (void *)rman_get_end(sc->bar[i].resource), rman_get_size(sc->bar[i].resource), (void *)sc->bar[i].kva); } return (0); } static void bxe_get_function_num(struct bxe_softc *sc) { uint32_t val = 0; /* * Read the ME register to get the function number. The ME register * holds the relative-function number and absolute-function number. The * absolute-function number appears only in E2 and above. Before that * these bits always contained zero, therefore we cannot blindly use them. */ val = REG_RD(sc, BAR_ME_REGISTER); sc->pfunc_rel = (uint8_t)((val & ME_REG_PF_NUM) >> ME_REG_PF_NUM_SHIFT); sc->path_id = (uint8_t)((val & ME_REG_ABS_PF_NUM) >> ME_REG_ABS_PF_NUM_SHIFT) & 1; if (CHIP_PORT_MODE(sc) == CHIP_4_PORT_MODE) { sc->pfunc_abs = ((sc->pfunc_rel << 1) | sc->path_id); } else { sc->pfunc_abs = (sc->pfunc_rel | sc->path_id); } BLOGD(sc, DBG_LOAD, "Relative function %d, Absolute function %d, Path %d\n", sc->pfunc_rel, sc->pfunc_abs, sc->path_id); } static uint32_t bxe_get_shmem_mf_cfg_base(struct bxe_softc *sc) { uint32_t shmem2_size; uint32_t offset; uint32_t mf_cfg_offset_value; /* Non 57712 */ offset = (SHMEM_RD(sc, func_mb) + (MAX_FUNC_NUM * sizeof(struct drv_func_mb))); /* 57712 plus */ if (sc->devinfo.shmem2_base != 0) { shmem2_size = SHMEM2_RD(sc, size); if (shmem2_size > offsetof(struct shmem2_region, mf_cfg_addr)) { mf_cfg_offset_value = SHMEM2_RD(sc, mf_cfg_addr); if (SHMEM_MF_CFG_ADDR_NONE != mf_cfg_offset_value) { offset = mf_cfg_offset_value; } } } return (offset); } static uint32_t bxe_pcie_capability_read(struct bxe_softc *sc, int reg, int width) { int pcie_reg; /* ensure PCIe capability is enabled */ if (pci_find_cap(sc->dev, PCIY_EXPRESS, &pcie_reg) == 0) { if (pcie_reg != 0) { BLOGD(sc, DBG_LOAD, "PCIe capability at 0x%04x\n", pcie_reg); return (pci_read_config(sc->dev, (pcie_reg + reg), width)); } } BLOGE(sc, "PCIe capability NOT FOUND!!!\n"); return (0); } static uint8_t bxe_is_pcie_pending(struct bxe_softc *sc) { return (bxe_pcie_capability_read(sc, PCIR_EXPRESS_DEVICE_STA, 2) & PCIM_EXP_STA_TRANSACTION_PND); } /* * Walk the PCI capabiites list for the device to find what features are * supported. These capabilites may be enabled/disabled by firmware so it's * best to walk the list rather than make assumptions. */ static void bxe_probe_pci_caps(struct bxe_softc *sc) { uint16_t link_status; int reg; /* check if PCI Power Management is enabled */ if (pci_find_cap(sc->dev, PCIY_PMG, ®) == 0) { if (reg != 0) { BLOGD(sc, DBG_LOAD, "Found PM capability at 0x%04x\n", reg); sc->devinfo.pcie_cap_flags |= BXE_PM_CAPABLE_FLAG; sc->devinfo.pcie_pm_cap_reg = (uint16_t)reg; } } link_status = bxe_pcie_capability_read(sc, PCIR_EXPRESS_LINK_STA, 2); /* handle PCIe 2.0 workarounds for 57710 */ if (CHIP_IS_E1(sc)) { /* workaround for 57710 errata E4_57710_27462 */ sc->devinfo.pcie_link_speed = (REG_RD(sc, 0x3d04) & (1 << 24)) ? 2 : 1; /* workaround for 57710 errata E4_57710_27488 */ sc->devinfo.pcie_link_width = ((link_status & PCIM_LINK_STA_WIDTH) >> 4); if (sc->devinfo.pcie_link_speed > 1) { sc->devinfo.pcie_link_width = ((link_status & PCIM_LINK_STA_WIDTH) >> 4) >> 1; } } else { sc->devinfo.pcie_link_speed = (link_status & PCIM_LINK_STA_SPEED); sc->devinfo.pcie_link_width = ((link_status & PCIM_LINK_STA_WIDTH) >> 4); } BLOGD(sc, DBG_LOAD, "PCIe link speed=%d width=%d\n", sc->devinfo.pcie_link_speed, sc->devinfo.pcie_link_width); sc->devinfo.pcie_cap_flags |= BXE_PCIE_CAPABLE_FLAG; sc->devinfo.pcie_pcie_cap_reg = (uint16_t)reg; /* check if MSI capability is enabled */ if (pci_find_cap(sc->dev, PCIY_MSI, ®) == 0) { if (reg != 0) { BLOGD(sc, DBG_LOAD, "Found MSI capability at 0x%04x\n", reg); sc->devinfo.pcie_cap_flags |= BXE_MSI_CAPABLE_FLAG; sc->devinfo.pcie_msi_cap_reg = (uint16_t)reg; } } /* check if MSI-X capability is enabled */ if (pci_find_cap(sc->dev, PCIY_MSIX, ®) == 0) { if (reg != 0) { BLOGD(sc, DBG_LOAD, "Found MSI-X capability at 0x%04x\n", reg); sc->devinfo.pcie_cap_flags |= BXE_MSIX_CAPABLE_FLAG; sc->devinfo.pcie_msix_cap_reg = (uint16_t)reg; } } } static int bxe_get_shmem_mf_cfg_info_sd(struct bxe_softc *sc) { struct bxe_mf_info *mf_info = &sc->devinfo.mf_info; uint32_t val; /* get the outer vlan if we're in switch-dependent mode */ val = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].e1hov_tag); mf_info->ext_id = (uint16_t)val; mf_info->multi_vnics_mode = 1; if (!VALID_OVLAN(mf_info->ext_id)) { BLOGE(sc, "Invalid VLAN (%d)\n", mf_info->ext_id); return (1); } /* get the capabilities */ if ((mf_info->mf_config[SC_VN(sc)] & FUNC_MF_CFG_PROTOCOL_MASK) == FUNC_MF_CFG_PROTOCOL_ISCSI) { mf_info->mf_protos_supported |= MF_PROTO_SUPPORT_ISCSI; } else if ((mf_info->mf_config[SC_VN(sc)] & FUNC_MF_CFG_PROTOCOL_MASK) == FUNC_MF_CFG_PROTOCOL_FCOE) { mf_info->mf_protos_supported |= MF_PROTO_SUPPORT_FCOE; } else { mf_info->mf_protos_supported |= MF_PROTO_SUPPORT_ETHERNET; } mf_info->vnics_per_port = (CHIP_PORT_MODE(sc) == CHIP_4_PORT_MODE) ? 2 : 4; return (0); } static uint32_t bxe_get_shmem_ext_proto_support_flags(struct bxe_softc *sc) { uint32_t retval = 0; uint32_t val; val = MFCFG_RD(sc, func_ext_config[SC_ABS_FUNC(sc)].func_cfg); if (val & MACP_FUNC_CFG_FLAGS_ENABLED) { if (val & MACP_FUNC_CFG_FLAGS_ETHERNET) { retval |= MF_PROTO_SUPPORT_ETHERNET; } if (val & MACP_FUNC_CFG_FLAGS_ISCSI_OFFLOAD) { retval |= MF_PROTO_SUPPORT_ISCSI; } if (val & MACP_FUNC_CFG_FLAGS_FCOE_OFFLOAD) { retval |= MF_PROTO_SUPPORT_FCOE; } } return (retval); } static int bxe_get_shmem_mf_cfg_info_si(struct bxe_softc *sc) { struct bxe_mf_info *mf_info = &sc->devinfo.mf_info; uint32_t val; /* * There is no outer vlan if we're in switch-independent mode. * If the mac is valid then assume multi-function. */ val = MFCFG_RD(sc, func_ext_config[SC_ABS_FUNC(sc)].func_cfg); mf_info->multi_vnics_mode = ((val & MACP_FUNC_CFG_FLAGS_MASK) != 0); mf_info->mf_protos_supported = bxe_get_shmem_ext_proto_support_flags(sc); mf_info->vnics_per_port = (CHIP_PORT_MODE(sc) == CHIP_4_PORT_MODE) ? 2 : 4; return (0); } static int bxe_get_shmem_mf_cfg_info_niv(struct bxe_softc *sc) { struct bxe_mf_info *mf_info = &sc->devinfo.mf_info; uint32_t e1hov_tag; uint32_t func_config; uint32_t niv_config; mf_info->multi_vnics_mode = 1; e1hov_tag = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].e1hov_tag); func_config = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].config); niv_config = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].afex_config); mf_info->ext_id = (uint16_t)((e1hov_tag & FUNC_MF_CFG_E1HOV_TAG_MASK) >> FUNC_MF_CFG_E1HOV_TAG_SHIFT); mf_info->default_vlan = (uint16_t)((e1hov_tag & FUNC_MF_CFG_AFEX_VLAN_MASK) >> FUNC_MF_CFG_AFEX_VLAN_SHIFT); mf_info->niv_allowed_priorities = (uint8_t)((niv_config & FUNC_MF_CFG_AFEX_COS_FILTER_MASK) >> FUNC_MF_CFG_AFEX_COS_FILTER_SHIFT); mf_info->niv_default_cos = (uint8_t)((func_config & FUNC_MF_CFG_TRANSMIT_PRIORITY_MASK) >> FUNC_MF_CFG_TRANSMIT_PRIORITY_SHIFT); mf_info->afex_vlan_mode = ((niv_config & FUNC_MF_CFG_AFEX_VLAN_MODE_MASK) >> FUNC_MF_CFG_AFEX_VLAN_MODE_SHIFT); mf_info->niv_mba_enabled = ((niv_config & FUNC_MF_CFG_AFEX_MBA_ENABLED_MASK) >> FUNC_MF_CFG_AFEX_MBA_ENABLED_SHIFT); mf_info->mf_protos_supported = bxe_get_shmem_ext_proto_support_flags(sc); mf_info->vnics_per_port = (CHIP_PORT_MODE(sc) == CHIP_4_PORT_MODE) ? 2 : 4; return (0); } static int bxe_check_valid_mf_cfg(struct bxe_softc *sc) { struct bxe_mf_info *mf_info = &sc->devinfo.mf_info; uint32_t mf_cfg1; uint32_t mf_cfg2; uint32_t ovlan1; uint32_t ovlan2; uint8_t i, j; BLOGD(sc, DBG_LOAD, "MF config parameters for function %d\n", SC_PORT(sc)); BLOGD(sc, DBG_LOAD, "\tmf_config=0x%x\n", mf_info->mf_config[SC_VN(sc)]); BLOGD(sc, DBG_LOAD, "\tmulti_vnics_mode=%d\n", mf_info->multi_vnics_mode); BLOGD(sc, DBG_LOAD, "\tvnics_per_port=%d\n", mf_info->vnics_per_port); BLOGD(sc, DBG_LOAD, "\tovlan/vifid=%d\n", mf_info->ext_id); BLOGD(sc, DBG_LOAD, "\tmin_bw=%d/%d/%d/%d\n", mf_info->min_bw[0], mf_info->min_bw[1], mf_info->min_bw[2], mf_info->min_bw[3]); BLOGD(sc, DBG_LOAD, "\tmax_bw=%d/%d/%d/%d\n", mf_info->max_bw[0], mf_info->max_bw[1], mf_info->max_bw[2], mf_info->max_bw[3]); BLOGD(sc, DBG_LOAD, "\tmac_addr: %s\n", sc->mac_addr_str); /* various MF mode sanity checks... */ if (mf_info->mf_config[SC_VN(sc)] & FUNC_MF_CFG_FUNC_HIDE) { BLOGE(sc, "Enumerated function %d is marked as hidden\n", SC_PORT(sc)); return (1); } if ((mf_info->vnics_per_port > 1) && !mf_info->multi_vnics_mode) { BLOGE(sc, "vnics_per_port=%d multi_vnics_mode=%d\n", mf_info->vnics_per_port, mf_info->multi_vnics_mode); return (1); } if (mf_info->mf_mode == MULTI_FUNCTION_SD) { /* vnic id > 0 must have valid ovlan in switch-dependent mode */ if ((SC_VN(sc) > 0) && !VALID_OVLAN(OVLAN(sc))) { BLOGE(sc, "mf_mode=SD vnic_id=%d ovlan=%d\n", SC_VN(sc), OVLAN(sc)); return (1); } if (!VALID_OVLAN(OVLAN(sc)) && mf_info->multi_vnics_mode) { BLOGE(sc, "mf_mode=SD multi_vnics_mode=%d ovlan=%d\n", mf_info->multi_vnics_mode, OVLAN(sc)); return (1); } /* * Verify all functions are either MF or SF mode. If MF, make sure * sure that all non-hidden functions have a valid ovlan. If SF, * make sure that all non-hidden functions have an invalid ovlan. */ FOREACH_ABS_FUNC_IN_PORT(sc, i) { mf_cfg1 = MFCFG_RD(sc, func_mf_config[i].config); ovlan1 = MFCFG_RD(sc, func_mf_config[i].e1hov_tag); if (!(mf_cfg1 & FUNC_MF_CFG_FUNC_HIDE) && (((mf_info->multi_vnics_mode) && !VALID_OVLAN(ovlan1)) || ((!mf_info->multi_vnics_mode) && VALID_OVLAN(ovlan1)))) { BLOGE(sc, "mf_mode=SD function %d MF config " "mismatch, multi_vnics_mode=%d ovlan=%d\n", i, mf_info->multi_vnics_mode, ovlan1); return (1); } } /* Verify all funcs on the same port each have a different ovlan. */ FOREACH_ABS_FUNC_IN_PORT(sc, i) { mf_cfg1 = MFCFG_RD(sc, func_mf_config[i].config); ovlan1 = MFCFG_RD(sc, func_mf_config[i].e1hov_tag); /* iterate from the next function on the port to the max func */ for (j = i + 2; j < MAX_FUNC_NUM; j += 2) { mf_cfg2 = MFCFG_RD(sc, func_mf_config[j].config); ovlan2 = MFCFG_RD(sc, func_mf_config[j].e1hov_tag); if (!(mf_cfg1 & FUNC_MF_CFG_FUNC_HIDE) && VALID_OVLAN(ovlan1) && !(mf_cfg2 & FUNC_MF_CFG_FUNC_HIDE) && VALID_OVLAN(ovlan2) && (ovlan1 == ovlan2)) { BLOGE(sc, "mf_mode=SD functions %d and %d " "have the same ovlan (%d)\n", i, j, ovlan1); return (1); } } } } /* MULTI_FUNCTION_SD */ return (0); } static int bxe_get_mf_cfg_info(struct bxe_softc *sc) { struct bxe_mf_info *mf_info = &sc->devinfo.mf_info; uint32_t val, mac_upper; uint8_t i, vnic; /* initialize mf_info defaults */ mf_info->vnics_per_port = 1; mf_info->multi_vnics_mode = FALSE; mf_info->path_has_ovlan = FALSE; mf_info->mf_mode = SINGLE_FUNCTION; if (!CHIP_IS_MF_CAP(sc)) { return (0); } if (sc->devinfo.mf_cfg_base == SHMEM_MF_CFG_ADDR_NONE) { BLOGE(sc, "Invalid mf_cfg_base!\n"); return (1); } /* get the MF mode (switch dependent / independent / single-function) */ val = SHMEM_RD(sc, dev_info.shared_feature_config.config); switch (val & SHARED_FEAT_CFG_FORCE_SF_MODE_MASK) { case SHARED_FEAT_CFG_FORCE_SF_MODE_SWITCH_INDEPT: mac_upper = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].mac_upper); /* check for legal upper mac bytes */ if (mac_upper != FUNC_MF_CFG_UPPERMAC_DEFAULT) { mf_info->mf_mode = MULTI_FUNCTION_SI; } else { BLOGE(sc, "Invalid config for Switch Independent mode\n"); } break; case SHARED_FEAT_CFG_FORCE_SF_MODE_MF_ALLOWED: case SHARED_FEAT_CFG_FORCE_SF_MODE_SPIO4: /* get outer vlan configuration */ val = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].e1hov_tag); if ((val & FUNC_MF_CFG_E1HOV_TAG_MASK) != FUNC_MF_CFG_E1HOV_TAG_DEFAULT) { mf_info->mf_mode = MULTI_FUNCTION_SD; } else { BLOGE(sc, "Invalid config for Switch Dependent mode\n"); } break; case SHARED_FEAT_CFG_FORCE_SF_MODE_FORCED_SF: /* not in MF mode, vnics_per_port=1 and multi_vnics_mode=FALSE */ return (0); case SHARED_FEAT_CFG_FORCE_SF_MODE_AFEX_MODE: /* * Mark MF mode as NIV if MCP version includes NPAR-SD support * and the MAC address is valid. */ mac_upper = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].mac_upper); if ((SHMEM2_HAS(sc, afex_driver_support)) && (mac_upper != FUNC_MF_CFG_UPPERMAC_DEFAULT)) { mf_info->mf_mode = MULTI_FUNCTION_AFEX; } else { BLOGE(sc, "Invalid config for AFEX mode\n"); } break; default: BLOGE(sc, "Unknown MF mode (0x%08x)\n", (val & SHARED_FEAT_CFG_FORCE_SF_MODE_MASK)); return (1); } /* set path mf_mode (which could be different than function mf_mode) */ if (mf_info->mf_mode == MULTI_FUNCTION_SD) { mf_info->path_has_ovlan = TRUE; } else if (mf_info->mf_mode == SINGLE_FUNCTION) { /* * Decide on path multi vnics mode. If we're not in MF mode and in * 4-port mode, this is good enough to check vnic-0 of the other port * on the same path */ if (CHIP_PORT_MODE(sc) == CHIP_4_PORT_MODE) { uint8_t other_port = !(PORT_ID(sc) & 1); uint8_t abs_func_other_port = (SC_PATH(sc) + (2 * other_port)); val = MFCFG_RD(sc, func_mf_config[abs_func_other_port].e1hov_tag); mf_info->path_has_ovlan = VALID_OVLAN((uint16_t)val) ? 1 : 0; } } if (mf_info->mf_mode == SINGLE_FUNCTION) { /* invalid MF config */ if (SC_VN(sc) >= 1) { BLOGE(sc, "VNIC ID >= 1 in SF mode\n"); return (1); } return (0); } /* get the MF configuration */ mf_info->mf_config[SC_VN(sc)] = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].config); switch(mf_info->mf_mode) { case MULTI_FUNCTION_SD: bxe_get_shmem_mf_cfg_info_sd(sc); break; case MULTI_FUNCTION_SI: bxe_get_shmem_mf_cfg_info_si(sc); break; case MULTI_FUNCTION_AFEX: bxe_get_shmem_mf_cfg_info_niv(sc); break; default: BLOGE(sc, "Get MF config failed (mf_mode=0x%08x)\n", mf_info->mf_mode); return (1); } /* get the congestion management parameters */ vnic = 0; FOREACH_ABS_FUNC_IN_PORT(sc, i) { /* get min/max bw */ val = MFCFG_RD(sc, func_mf_config[i].config); mf_info->min_bw[vnic] = ((val & FUNC_MF_CFG_MIN_BW_MASK) >> FUNC_MF_CFG_MIN_BW_SHIFT); mf_info->max_bw[vnic] = ((val & FUNC_MF_CFG_MAX_BW_MASK) >> FUNC_MF_CFG_MAX_BW_SHIFT); vnic++; } return (bxe_check_valid_mf_cfg(sc)); } static int bxe_get_shmem_info(struct bxe_softc *sc) { int port; uint32_t mac_hi, mac_lo, val; port = SC_PORT(sc); mac_hi = mac_lo = 0; sc->link_params.sc = sc; sc->link_params.port = port; /* get the hardware config info */ sc->devinfo.hw_config = SHMEM_RD(sc, dev_info.shared_hw_config.config); sc->devinfo.hw_config2 = SHMEM_RD(sc, dev_info.shared_hw_config.config2); sc->link_params.hw_led_mode = ((sc->devinfo.hw_config & SHARED_HW_CFG_LED_MODE_MASK) >> SHARED_HW_CFG_LED_MODE_SHIFT); /* get the port feature config */ sc->port.config = SHMEM_RD(sc, dev_info.port_feature_config[port].config), /* get the link params */ sc->link_params.speed_cap_mask[0] = SHMEM_RD(sc, dev_info.port_hw_config[port].speed_capability_mask); sc->link_params.speed_cap_mask[1] = SHMEM_RD(sc, dev_info.port_hw_config[port].speed_capability_mask2); /* get the lane config */ sc->link_params.lane_config = SHMEM_RD(sc, dev_info.port_hw_config[port].lane_config); /* get the link config */ val = SHMEM_RD(sc, dev_info.port_feature_config[port].link_config); sc->port.link_config[ELINK_INT_PHY] = val; sc->link_params.switch_cfg = (val & PORT_FEATURE_CONNECTED_SWITCH_MASK); sc->port.link_config[ELINK_EXT_PHY1] = SHMEM_RD(sc, dev_info.port_feature_config[port].link_config2); /* get the override preemphasis flag and enable it or turn it off */ val = SHMEM_RD(sc, dev_info.shared_feature_config.config); if (val & SHARED_FEAT_CFG_OVERRIDE_PREEMPHASIS_CFG_ENABLED) { sc->link_params.feature_config_flags |= ELINK_FEATURE_CONFIG_OVERRIDE_PREEMPHASIS_ENABLED; } else { sc->link_params.feature_config_flags &= ~ELINK_FEATURE_CONFIG_OVERRIDE_PREEMPHASIS_ENABLED; } /* get the initial value of the link params */ sc->link_params.multi_phy_config = SHMEM_RD(sc, dev_info.port_hw_config[port].multi_phy_config); /* get external phy info */ sc->port.ext_phy_config = SHMEM_RD(sc, dev_info.port_hw_config[port].external_phy_config); /* get the multifunction configuration */ bxe_get_mf_cfg_info(sc); /* get the mac address */ if (IS_MF(sc)) { mac_hi = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].mac_upper); mac_lo = MFCFG_RD(sc, func_mf_config[SC_ABS_FUNC(sc)].mac_lower); } else { mac_hi = SHMEM_RD(sc, dev_info.port_hw_config[port].mac_upper); mac_lo = SHMEM_RD(sc, dev_info.port_hw_config[port].mac_lower); } if ((mac_lo == 0) && (mac_hi == 0)) { *sc->mac_addr_str = 0; BLOGE(sc, "No Ethernet address programmed!\n"); } else { sc->link_params.mac_addr[0] = (uint8_t)(mac_hi >> 8); sc->link_params.mac_addr[1] = (uint8_t)(mac_hi); sc->link_params.mac_addr[2] = (uint8_t)(mac_lo >> 24); sc->link_params.mac_addr[3] = (uint8_t)(mac_lo >> 16); sc->link_params.mac_addr[4] = (uint8_t)(mac_lo >> 8); sc->link_params.mac_addr[5] = (uint8_t)(mac_lo); snprintf(sc->mac_addr_str, sizeof(sc->mac_addr_str), "%02x:%02x:%02x:%02x:%02x:%02x", sc->link_params.mac_addr[0], sc->link_params.mac_addr[1], sc->link_params.mac_addr[2], sc->link_params.mac_addr[3], sc->link_params.mac_addr[4], sc->link_params.mac_addr[5]); BLOGD(sc, DBG_LOAD, "Ethernet address: %s\n", sc->mac_addr_str); } #if 0 if (!IS_MF(sc) && ((sc->port.config & PORT_FEAT_CFG_STORAGE_PERSONALITY_MASK) == PORT_FEAT_CFG_STORAGE_PERSONALITY_FCOE)) { sc->flags |= BXE_NO_ISCSI; } if (!IS_MF(sc) && ((sc->port.config & PORT_FEAT_CFG_STORAGE_PERSONALITY_MASK) == PORT_FEAT_CFG_STORAGE_PERSONALITY_ISCSI)) { sc->flags |= BXE_NO_FCOE_FLAG; } #endif return (0); } static void bxe_get_tunable_params(struct bxe_softc *sc) { /* sanity checks */ if ((bxe_interrupt_mode != INTR_MODE_INTX) && (bxe_interrupt_mode != INTR_MODE_MSI) && (bxe_interrupt_mode != INTR_MODE_MSIX)) { BLOGW(sc, "invalid interrupt_mode value (%d)\n", bxe_interrupt_mode); bxe_interrupt_mode = INTR_MODE_MSIX; } if ((bxe_queue_count < 0) || (bxe_queue_count > MAX_RSS_CHAINS)) { BLOGW(sc, "invalid queue_count value (%d)\n", bxe_queue_count); bxe_queue_count = 0; } if ((bxe_max_rx_bufs < 1) || (bxe_max_rx_bufs > RX_BD_USABLE)) { if (bxe_max_rx_bufs == 0) { bxe_max_rx_bufs = RX_BD_USABLE; } else { BLOGW(sc, "invalid max_rx_bufs (%d)\n", bxe_max_rx_bufs); bxe_max_rx_bufs = 2048; } } if ((bxe_hc_rx_ticks < 1) || (bxe_hc_rx_ticks > 100)) { BLOGW(sc, "invalid hc_rx_ticks (%d)\n", bxe_hc_rx_ticks); bxe_hc_rx_ticks = 25; } if ((bxe_hc_tx_ticks < 1) || (bxe_hc_tx_ticks > 100)) { BLOGW(sc, "invalid hc_tx_ticks (%d)\n", bxe_hc_tx_ticks); bxe_hc_tx_ticks = 50; } if (bxe_max_aggregation_size == 0) { bxe_max_aggregation_size = TPA_AGG_SIZE; } if (bxe_max_aggregation_size > 0xffff) { BLOGW(sc, "invalid max_aggregation_size (%d)\n", bxe_max_aggregation_size); bxe_max_aggregation_size = TPA_AGG_SIZE; } if ((bxe_mrrs < -1) || (bxe_mrrs > 3)) { BLOGW(sc, "invalid mrrs (%d)\n", bxe_mrrs); bxe_mrrs = -1; } if ((bxe_autogreeen < 0) || (bxe_autogreeen > 2)) { BLOGW(sc, "invalid autogreeen (%d)\n", bxe_autogreeen); bxe_autogreeen = 0; } if ((bxe_udp_rss < 0) || (bxe_udp_rss > 1)) { BLOGW(sc, "invalid udp_rss (%d)\n", bxe_udp_rss); bxe_udp_rss = 0; } /* pull in user settings */ sc->interrupt_mode = bxe_interrupt_mode; sc->max_rx_bufs = bxe_max_rx_bufs; sc->hc_rx_ticks = bxe_hc_rx_ticks; sc->hc_tx_ticks = bxe_hc_tx_ticks; sc->max_aggregation_size = bxe_max_aggregation_size; sc->mrrs = bxe_mrrs; sc->autogreeen = bxe_autogreeen; sc->udp_rss = bxe_udp_rss; if (bxe_interrupt_mode == INTR_MODE_INTX) { sc->num_queues = 1; } else { /* INTR_MODE_MSI or INTR_MODE_MSIX */ sc->num_queues = min((bxe_queue_count ? bxe_queue_count : mp_ncpus), MAX_RSS_CHAINS); if (sc->num_queues > mp_ncpus) { sc->num_queues = mp_ncpus; } } BLOGD(sc, DBG_LOAD, "User Config: " "debug=0x%lx " "interrupt_mode=%d " "queue_count=%d " "hc_rx_ticks=%d " "hc_tx_ticks=%d " "rx_budget=%d " "max_aggregation_size=%d " "mrrs=%d " "autogreeen=%d " "udp_rss=%d\n", bxe_debug, sc->interrupt_mode, sc->num_queues, sc->hc_rx_ticks, sc->hc_tx_ticks, bxe_rx_budget, sc->max_aggregation_size, sc->mrrs, sc->autogreeen, sc->udp_rss); } static void bxe_media_detect(struct bxe_softc *sc) { uint32_t phy_idx = bxe_get_cur_phy_idx(sc); switch (sc->link_params.phy[phy_idx].media_type) { case ELINK_ETH_PHY_SFPP_10G_FIBER: case ELINK_ETH_PHY_XFP_FIBER: BLOGI(sc, "Found 10Gb Fiber media.\n"); sc->media = IFM_10G_SR; break; case ELINK_ETH_PHY_SFP_1G_FIBER: BLOGI(sc, "Found 1Gb Fiber media.\n"); sc->media = IFM_1000_SX; break; case ELINK_ETH_PHY_KR: case ELINK_ETH_PHY_CX4: BLOGI(sc, "Found 10GBase-CX4 media.\n"); sc->media = IFM_10G_CX4; break; case ELINK_ETH_PHY_DA_TWINAX: BLOGI(sc, "Found 10Gb Twinax media.\n"); sc->media = IFM_10G_TWINAX; break; case ELINK_ETH_PHY_BASE_T: if (sc->link_params.speed_cap_mask[0] & PORT_HW_CFG_SPEED_CAPABILITY_D0_10G) { BLOGI(sc, "Found 10GBase-T media.\n"); sc->media = IFM_10G_T; } else { BLOGI(sc, "Found 1000Base-T media.\n"); sc->media = IFM_1000_T; } break; case ELINK_ETH_PHY_NOT_PRESENT: BLOGI(sc, "Media not present.\n"); sc->media = 0; break; case ELINK_ETH_PHY_UNSPECIFIED: default: BLOGI(sc, "Unknown media!\n"); sc->media = 0; break; } } #define GET_FIELD(value, fname) \ (((value) & (fname##_MASK)) >> (fname##_SHIFT)) #define IGU_FID(val) GET_FIELD((val), IGU_REG_MAPPING_MEMORY_FID) #define IGU_VEC(val) GET_FIELD((val), IGU_REG_MAPPING_MEMORY_VECTOR) static int bxe_get_igu_cam_info(struct bxe_softc *sc) { int pfid = SC_FUNC(sc); int igu_sb_id; uint32_t val; uint8_t fid, igu_sb_cnt = 0; sc->igu_base_sb = 0xff; if (CHIP_INT_MODE_IS_BC(sc)) { int vn = SC_VN(sc); igu_sb_cnt = sc->igu_sb_cnt; sc->igu_base_sb = ((CHIP_IS_MODE_4_PORT(sc) ? pfid : vn) * FP_SB_MAX_E1x); sc->igu_dsb_id = (E1HVN_MAX * FP_SB_MAX_E1x + (CHIP_IS_MODE_4_PORT(sc) ? pfid : vn)); return (0); } /* IGU in normal mode - read CAM */ for (igu_sb_id = 0; igu_sb_id < IGU_REG_MAPPING_MEMORY_SIZE; igu_sb_id++) { val = REG_RD(sc, IGU_REG_MAPPING_MEMORY + igu_sb_id * 4); if (!(val & IGU_REG_MAPPING_MEMORY_VALID)) { continue; } fid = IGU_FID(val); if ((fid & IGU_FID_ENCODE_IS_PF)) { if ((fid & IGU_FID_PF_NUM_MASK) != pfid) { continue; } if (IGU_VEC(val) == 0) { /* default status block */ sc->igu_dsb_id = igu_sb_id; } else { if (sc->igu_base_sb == 0xff) { sc->igu_base_sb = igu_sb_id; } igu_sb_cnt++; } } } /* * Due to new PF resource allocation by MFW T7.4 and above, it's optional * that number of CAM entries will not be equal to the value advertised in * PCI. Driver should use the minimal value of both as the actual status * block count */ sc->igu_sb_cnt = min(sc->igu_sb_cnt, igu_sb_cnt); if (igu_sb_cnt == 0) { BLOGE(sc, "CAM configuration error\n"); return (-1); } return (0); } /* * Gather various information from the device config space, the device itself, * shmem, and the user input. */ static int bxe_get_device_info(struct bxe_softc *sc) { uint32_t val; int rc; /* Get the data for the device */ sc->devinfo.vendor_id = pci_get_vendor(sc->dev); sc->devinfo.device_id = pci_get_device(sc->dev); sc->devinfo.subvendor_id = pci_get_subvendor(sc->dev); sc->devinfo.subdevice_id = pci_get_subdevice(sc->dev); /* get the chip revision (chip metal comes from pci config space) */ sc->devinfo.chip_id = sc->link_params.chip_id = (((REG_RD(sc, MISC_REG_CHIP_NUM) & 0xffff) << 16) | ((REG_RD(sc, MISC_REG_CHIP_REV) & 0xf) << 12) | (((REG_RD(sc, PCICFG_OFFSET + PCI_ID_VAL3) >> 24) & 0xf) << 4) | ((REG_RD(sc, MISC_REG_BOND_ID) & 0xf) << 0)); /* force 57811 according to MISC register */ if (REG_RD(sc, MISC_REG_CHIP_TYPE) & MISC_REG_CHIP_TYPE_57811_MASK) { if (CHIP_IS_57810(sc)) { sc->devinfo.chip_id = ((CHIP_NUM_57811 << 16) | (sc->devinfo.chip_id & 0x0000ffff)); } else if (CHIP_IS_57810_MF(sc)) { sc->devinfo.chip_id = ((CHIP_NUM_57811_MF << 16) | (sc->devinfo.chip_id & 0x0000ffff)); } sc->devinfo.chip_id |= 0x1; } BLOGD(sc, DBG_LOAD, "chip_id=0x%08x (num=0x%04x rev=0x%01x metal=0x%02x bond=0x%01x)\n", sc->devinfo.chip_id, ((sc->devinfo.chip_id >> 16) & 0xffff), ((sc->devinfo.chip_id >> 12) & 0xf), ((sc->devinfo.chip_id >> 4) & 0xff), ((sc->devinfo.chip_id >> 0) & 0xf)); val = (REG_RD(sc, 0x2874) & 0x55); if ((sc->devinfo.chip_id & 0x1) || (CHIP_IS_E1(sc) && val) || (CHIP_IS_E1H(sc) && (val == 0x55))) { sc->flags |= BXE_ONE_PORT_FLAG; BLOGD(sc, DBG_LOAD, "single port device\n"); } /* set the doorbell size */ sc->doorbell_size = (1 << BXE_DB_SHIFT); /* determine whether the device is in 2 port or 4 port mode */ sc->devinfo.chip_port_mode = CHIP_PORT_MODE_NONE; /* E1 & E1h*/ if (CHIP_IS_E2E3(sc)) { /* * Read port4mode_en_ovwr[0]: * If 1, four port mode is in port4mode_en_ovwr[1]. * If 0, four port mode is in port4mode_en[0]. */ val = REG_RD(sc, MISC_REG_PORT4MODE_EN_OVWR); if (val & 1) { val = ((val >> 1) & 1); } else { val = REG_RD(sc, MISC_REG_PORT4MODE_EN); } sc->devinfo.chip_port_mode = (val) ? CHIP_4_PORT_MODE : CHIP_2_PORT_MODE; BLOGD(sc, DBG_LOAD, "Port mode = %s\n", (val) ? "4" : "2"); } /* get the function and path info for the device */ bxe_get_function_num(sc); /* get the shared memory base address */ sc->devinfo.shmem_base = sc->link_params.shmem_base = REG_RD(sc, MISC_REG_SHARED_MEM_ADDR); sc->devinfo.shmem2_base = REG_RD(sc, (SC_PATH(sc) ? MISC_REG_GENERIC_CR_1 : MISC_REG_GENERIC_CR_0)); BLOGD(sc, DBG_LOAD, "shmem_base=0x%08x, shmem2_base=0x%08x\n", sc->devinfo.shmem_base, sc->devinfo.shmem2_base); if (!sc->devinfo.shmem_base) { /* this should ONLY prevent upcoming shmem reads */ BLOGI(sc, "MCP not active\n"); sc->flags |= BXE_NO_MCP_FLAG; return (0); } /* make sure the shared memory contents are valid */ val = SHMEM_RD(sc, validity_map[SC_PORT(sc)]); if ((val & (SHR_MEM_VALIDITY_DEV_INFO | SHR_MEM_VALIDITY_MB)) != (SHR_MEM_VALIDITY_DEV_INFO | SHR_MEM_VALIDITY_MB)) { BLOGE(sc, "Invalid SHMEM validity signature: 0x%08x\n", val); return (0); } BLOGD(sc, DBG_LOAD, "Valid SHMEM validity signature: 0x%08x\n", val); /* get the bootcode version */ sc->devinfo.bc_ver = SHMEM_RD(sc, dev_info.bc_rev); snprintf(sc->devinfo.bc_ver_str, sizeof(sc->devinfo.bc_ver_str), "%d.%d.%d", ((sc->devinfo.bc_ver >> 24) & 0xff), ((sc->devinfo.bc_ver >> 16) & 0xff), ((sc->devinfo.bc_ver >> 8) & 0xff)); BLOGD(sc, DBG_LOAD, "Bootcode version: %s\n", sc->devinfo.bc_ver_str); /* get the bootcode shmem address */ sc->devinfo.mf_cfg_base = bxe_get_shmem_mf_cfg_base(sc); BLOGD(sc, DBG_LOAD, "mf_cfg_base=0x08%x \n", sc->devinfo.mf_cfg_base); /* clean indirect addresses as they're not used */ pci_write_config(sc->dev, PCICFG_GRC_ADDRESS, 0, 4); if (IS_PF(sc)) { REG_WR(sc, PXP2_REG_PGL_ADDR_88_F0, 0); REG_WR(sc, PXP2_REG_PGL_ADDR_8C_F0, 0); REG_WR(sc, PXP2_REG_PGL_ADDR_90_F0, 0); REG_WR(sc, PXP2_REG_PGL_ADDR_94_F0, 0); if (CHIP_IS_E1x(sc)) { REG_WR(sc, PXP2_REG_PGL_ADDR_88_F1, 0); REG_WR(sc, PXP2_REG_PGL_ADDR_8C_F1, 0); REG_WR(sc, PXP2_REG_PGL_ADDR_90_F1, 0); REG_WR(sc, PXP2_REG_PGL_ADDR_94_F1, 0); } /* * Enable internal target-read (in case we are probed after PF * FLR). Must be done prior to any BAR read access. Only for * 57712 and up */ if (!CHIP_IS_E1x(sc)) { REG_WR(sc, PGLUE_B_REG_INTERNAL_PFID_ENABLE_TARGET_READ, 1); } } /* get the nvram size */ val = REG_RD(sc, MCP_REG_MCPR_NVM_CFG4); sc->devinfo.flash_size = (NVRAM_1MB_SIZE << (val & MCPR_NVM_CFG4_FLASH_SIZE)); BLOGD(sc, DBG_LOAD, "nvram flash size: %d\n", sc->devinfo.flash_size); /* get PCI capabilites */ bxe_probe_pci_caps(sc); bxe_set_power_state(sc, PCI_PM_D0); /* get various configuration parameters from shmem */ bxe_get_shmem_info(sc); if (sc->devinfo.pcie_msix_cap_reg != 0) { val = pci_read_config(sc->dev, (sc->devinfo.pcie_msix_cap_reg + PCIR_MSIX_CTRL), 2); sc->igu_sb_cnt = (val & PCIM_MSIXCTRL_TABLE_SIZE); } else { sc->igu_sb_cnt = 1; } sc->igu_base_addr = BAR_IGU_INTMEM; /* initialize IGU parameters */ if (CHIP_IS_E1x(sc)) { sc->devinfo.int_block = INT_BLOCK_HC; sc->igu_dsb_id = DEF_SB_IGU_ID; sc->igu_base_sb = 0; } else { sc->devinfo.int_block = INT_BLOCK_IGU; /* do not allow device reset during IGU info preocessing */ bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_RESET); val = REG_RD(sc, IGU_REG_BLOCK_CONFIGURATION); if (val & IGU_BLOCK_CONFIGURATION_REG_BACKWARD_COMP_EN) { int tout = 5000; BLOGD(sc, DBG_LOAD, "FORCING IGU Normal Mode\n"); val &= ~(IGU_BLOCK_CONFIGURATION_REG_BACKWARD_COMP_EN); REG_WR(sc, IGU_REG_BLOCK_CONFIGURATION, val); REG_WR(sc, IGU_REG_RESET_MEMORIES, 0x7f); while (tout && REG_RD(sc, IGU_REG_RESET_MEMORIES)) { tout--; DELAY(1000); } if (REG_RD(sc, IGU_REG_RESET_MEMORIES)) { BLOGD(sc, DBG_LOAD, "FORCING IGU Normal Mode failed!!!\n"); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RESET); return (-1); } } if (val & IGU_BLOCK_CONFIGURATION_REG_BACKWARD_COMP_EN) { BLOGD(sc, DBG_LOAD, "IGU Backward Compatible Mode\n"); sc->devinfo.int_block |= INT_BLOCK_MODE_BW_COMP; } else { BLOGD(sc, DBG_LOAD, "IGU Normal Mode\n"); } rc = bxe_get_igu_cam_info(sc); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RESET); if (rc) { return (rc); } } /* * Get base FW non-default (fast path) status block ID. This value is * used to initialize the fw_sb_id saved on the fp/queue structure to * determine the id used by the FW. */ if (CHIP_IS_E1x(sc)) { sc->base_fw_ndsb = ((SC_PORT(sc) * FP_SB_MAX_E1x) + SC_L_ID(sc)); } else { /* * 57712+ - We currently use one FW SB per IGU SB (Rx and Tx of * the same queue are indicated on the same IGU SB). So we prefer * FW and IGU SBs to be the same value. */ sc->base_fw_ndsb = sc->igu_base_sb; } BLOGD(sc, DBG_LOAD, "igu_dsb_id=%d igu_base_sb=%d igu_sb_cnt=%d base_fw_ndsb=%d\n", sc->igu_dsb_id, sc->igu_base_sb, sc->igu_sb_cnt, sc->base_fw_ndsb); elink_phy_probe(&sc->link_params); return (0); } static void bxe_link_settings_supported(struct bxe_softc *sc, uint32_t switch_cfg) { uint32_t cfg_size = 0; uint32_t idx; uint8_t port = SC_PORT(sc); /* aggregation of supported attributes of all external phys */ sc->port.supported[0] = 0; sc->port.supported[1] = 0; switch (sc->link_params.num_phys) { case 1: sc->port.supported[0] = sc->link_params.phy[ELINK_INT_PHY].supported; cfg_size = 1; break; case 2: sc->port.supported[0] = sc->link_params.phy[ELINK_EXT_PHY1].supported; cfg_size = 1; break; case 3: if (sc->link_params.multi_phy_config & PORT_HW_CFG_PHY_SWAPPED_ENABLED) { sc->port.supported[1] = sc->link_params.phy[ELINK_EXT_PHY1].supported; sc->port.supported[0] = sc->link_params.phy[ELINK_EXT_PHY2].supported; } else { sc->port.supported[0] = sc->link_params.phy[ELINK_EXT_PHY1].supported; sc->port.supported[1] = sc->link_params.phy[ELINK_EXT_PHY2].supported; } cfg_size = 2; break; } if (!(sc->port.supported[0] || sc->port.supported[1])) { BLOGE(sc, "Invalid phy config in NVRAM (PHY1=0x%08x PHY2=0x%08x)\n", SHMEM_RD(sc, dev_info.port_hw_config[port].external_phy_config), SHMEM_RD(sc, dev_info.port_hw_config[port].external_phy_config2)); return; } if (CHIP_IS_E3(sc)) sc->port.phy_addr = REG_RD(sc, MISC_REG_WC0_CTRL_PHY_ADDR); else { switch (switch_cfg) { case ELINK_SWITCH_CFG_1G: sc->port.phy_addr = REG_RD(sc, NIG_REG_SERDES0_CTRL_PHY_ADDR + port*0x10); break; case ELINK_SWITCH_CFG_10G: sc->port.phy_addr = REG_RD(sc, NIG_REG_XGXS0_CTRL_PHY_ADDR + port*0x18); break; default: BLOGE(sc, "Invalid switch config in link_config=0x%08x\n", sc->port.link_config[0]); return; } } BLOGD(sc, DBG_LOAD, "PHY addr 0x%08x\n", sc->port.phy_addr); /* mask what we support according to speed_cap_mask per configuration */ for (idx = 0; idx < cfg_size; idx++) { if (!(sc->link_params.speed_cap_mask[idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_10M_HALF)) { sc->port.supported[idx] &= ~ELINK_SUPPORTED_10baseT_Half; } if (!(sc->link_params.speed_cap_mask[idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_10M_FULL)) { sc->port.supported[idx] &= ~ELINK_SUPPORTED_10baseT_Full; } if (!(sc->link_params.speed_cap_mask[idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_100M_HALF)) { sc->port.supported[idx] &= ~ELINK_SUPPORTED_100baseT_Half; } if (!(sc->link_params.speed_cap_mask[idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_100M_FULL)) { sc->port.supported[idx] &= ~ELINK_SUPPORTED_100baseT_Full; } if (!(sc->link_params.speed_cap_mask[idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_1G)) { sc->port.supported[idx] &= ~ELINK_SUPPORTED_1000baseT_Full; } if (!(sc->link_params.speed_cap_mask[idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_2_5G)) { sc->port.supported[idx] &= ~ELINK_SUPPORTED_2500baseX_Full; } if (!(sc->link_params.speed_cap_mask[idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_10G)) { sc->port.supported[idx] &= ~ELINK_SUPPORTED_10000baseT_Full; } if (!(sc->link_params.speed_cap_mask[idx] & PORT_HW_CFG_SPEED_CAPABILITY_D0_20G)) { sc->port.supported[idx] &= ~ELINK_SUPPORTED_20000baseKR2_Full; } } BLOGD(sc, DBG_LOAD, "PHY supported 0=0x%08x 1=0x%08x\n", sc->port.supported[0], sc->port.supported[1]); } static void bxe_link_settings_requested(struct bxe_softc *sc) { uint32_t link_config; uint32_t idx; uint32_t cfg_size = 0; sc->port.advertising[0] = 0; sc->port.advertising[1] = 0; switch (sc->link_params.num_phys) { case 1: case 2: cfg_size = 1; break; case 3: cfg_size = 2; break; } for (idx = 0; idx < cfg_size; idx++) { sc->link_params.req_duplex[idx] = DUPLEX_FULL; link_config = sc->port.link_config[idx]; switch (link_config & PORT_FEATURE_LINK_SPEED_MASK) { case PORT_FEATURE_LINK_SPEED_AUTO: if (sc->port.supported[idx] & ELINK_SUPPORTED_Autoneg) { sc->link_params.req_line_speed[idx] = ELINK_SPEED_AUTO_NEG; sc->port.advertising[idx] |= sc->port.supported[idx]; if (sc->link_params.phy[ELINK_EXT_PHY1].type == PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84833) sc->port.advertising[idx] |= (ELINK_SUPPORTED_100baseT_Half | ELINK_SUPPORTED_100baseT_Full); } else { /* force 10G, no AN */ sc->link_params.req_line_speed[idx] = ELINK_SPEED_10000; sc->port.advertising[idx] |= (ADVERTISED_10000baseT_Full | ADVERTISED_FIBRE); continue; } break; case PORT_FEATURE_LINK_SPEED_10M_FULL: if (sc->port.supported[idx] & ELINK_SUPPORTED_10baseT_Full) { sc->link_params.req_line_speed[idx] = ELINK_SPEED_10; sc->port.advertising[idx] |= (ADVERTISED_10baseT_Full | ADVERTISED_TP); } else { BLOGE(sc, "Invalid NVRAM config link_config=0x%08x " "speed_cap_mask=0x%08x\n", link_config, sc->link_params.speed_cap_mask[idx]); return; } break; case PORT_FEATURE_LINK_SPEED_10M_HALF: if (sc->port.supported[idx] & ELINK_SUPPORTED_10baseT_Half) { sc->link_params.req_line_speed[idx] = ELINK_SPEED_10; sc->link_params.req_duplex[idx] = DUPLEX_HALF; sc->port.advertising[idx] |= (ADVERTISED_10baseT_Half | ADVERTISED_TP); } else { BLOGE(sc, "Invalid NVRAM config link_config=0x%08x " "speed_cap_mask=0x%08x\n", link_config, sc->link_params.speed_cap_mask[idx]); return; } break; case PORT_FEATURE_LINK_SPEED_100M_FULL: if (sc->port.supported[idx] & ELINK_SUPPORTED_100baseT_Full) { sc->link_params.req_line_speed[idx] = ELINK_SPEED_100; sc->port.advertising[idx] |= (ADVERTISED_100baseT_Full | ADVERTISED_TP); } else { BLOGE(sc, "Invalid NVRAM config link_config=0x%08x " "speed_cap_mask=0x%08x\n", link_config, sc->link_params.speed_cap_mask[idx]); return; } break; case PORT_FEATURE_LINK_SPEED_100M_HALF: if (sc->port.supported[idx] & ELINK_SUPPORTED_100baseT_Half) { sc->link_params.req_line_speed[idx] = ELINK_SPEED_100; sc->link_params.req_duplex[idx] = DUPLEX_HALF; sc->port.advertising[idx] |= (ADVERTISED_100baseT_Half | ADVERTISED_TP); } else { BLOGE(sc, "Invalid NVRAM config link_config=0x%08x " "speed_cap_mask=0x%08x\n", link_config, sc->link_params.speed_cap_mask[idx]); return; } break; case PORT_FEATURE_LINK_SPEED_1G: if (sc->port.supported[idx] & ELINK_SUPPORTED_1000baseT_Full) { sc->link_params.req_line_speed[idx] = ELINK_SPEED_1000; sc->port.advertising[idx] |= (ADVERTISED_1000baseT_Full | ADVERTISED_TP); } else { BLOGE(sc, "Invalid NVRAM config link_config=0x%08x " "speed_cap_mask=0x%08x\n", link_config, sc->link_params.speed_cap_mask[idx]); return; } break; case PORT_FEATURE_LINK_SPEED_2_5G: if (sc->port.supported[idx] & ELINK_SUPPORTED_2500baseX_Full) { sc->link_params.req_line_speed[idx] = ELINK_SPEED_2500; sc->port.advertising[idx] |= (ADVERTISED_2500baseX_Full | ADVERTISED_TP); } else { BLOGE(sc, "Invalid NVRAM config link_config=0x%08x " "speed_cap_mask=0x%08x\n", link_config, sc->link_params.speed_cap_mask[idx]); return; } break; case PORT_FEATURE_LINK_SPEED_10G_CX4: if (sc->port.supported[idx] & ELINK_SUPPORTED_10000baseT_Full) { sc->link_params.req_line_speed[idx] = ELINK_SPEED_10000; sc->port.advertising[idx] |= (ADVERTISED_10000baseT_Full | ADVERTISED_FIBRE); } else { BLOGE(sc, "Invalid NVRAM config link_config=0x%08x " "speed_cap_mask=0x%08x\n", link_config, sc->link_params.speed_cap_mask[idx]); return; } break; case PORT_FEATURE_LINK_SPEED_20G: sc->link_params.req_line_speed[idx] = ELINK_SPEED_20000; break; default: BLOGE(sc, "Invalid NVRAM config link_config=0x%08x " "speed_cap_mask=0x%08x\n", link_config, sc->link_params.speed_cap_mask[idx]); sc->link_params.req_line_speed[idx] = ELINK_SPEED_AUTO_NEG; sc->port.advertising[idx] = sc->port.supported[idx]; break; } sc->link_params.req_flow_ctrl[idx] = (link_config & PORT_FEATURE_FLOW_CONTROL_MASK); if (sc->link_params.req_flow_ctrl[idx] == ELINK_FLOW_CTRL_AUTO) { if (!(sc->port.supported[idx] & ELINK_SUPPORTED_Autoneg)) { sc->link_params.req_flow_ctrl[idx] = ELINK_FLOW_CTRL_NONE; } else { bxe_set_requested_fc(sc); } } BLOGD(sc, DBG_LOAD, "req_line_speed=%d req_duplex=%d " "req_flow_ctrl=0x%x advertising=0x%x\n", sc->link_params.req_line_speed[idx], sc->link_params.req_duplex[idx], sc->link_params.req_flow_ctrl[idx], sc->port.advertising[idx]); } } static void bxe_get_phy_info(struct bxe_softc *sc) { uint8_t port = SC_PORT(sc); uint32_t config = sc->port.config; uint32_t eee_mode; /* shmem data already read in bxe_get_shmem_info() */ BLOGD(sc, DBG_LOAD, "lane_config=0x%08x speed_cap_mask0=0x%08x " "link_config0=0x%08x\n", sc->link_params.lane_config, sc->link_params.speed_cap_mask[0], sc->port.link_config[0]); bxe_link_settings_supported(sc, sc->link_params.switch_cfg); bxe_link_settings_requested(sc); if (sc->autogreeen == AUTO_GREEN_FORCE_ON) { sc->link_params.feature_config_flags |= ELINK_FEATURE_CONFIG_AUTOGREEEN_ENABLED; } else if (sc->autogreeen == AUTO_GREEN_FORCE_OFF) { sc->link_params.feature_config_flags &= ~ELINK_FEATURE_CONFIG_AUTOGREEEN_ENABLED; } else if (config & PORT_FEAT_CFG_AUTOGREEEN_ENABLED) { sc->link_params.feature_config_flags |= ELINK_FEATURE_CONFIG_AUTOGREEEN_ENABLED; } /* configure link feature according to nvram value */ eee_mode = (((SHMEM_RD(sc, dev_info.port_feature_config[port].eee_power_mode)) & PORT_FEAT_CFG_EEE_POWER_MODE_MASK) >> PORT_FEAT_CFG_EEE_POWER_MODE_SHIFT); if (eee_mode != PORT_FEAT_CFG_EEE_POWER_MODE_DISABLED) { sc->link_params.eee_mode = (ELINK_EEE_MODE_ADV_LPI | ELINK_EEE_MODE_ENABLE_LPI | ELINK_EEE_MODE_OUTPUT_TIME); } else { sc->link_params.eee_mode = 0; } /* get the media type */ bxe_media_detect(sc); } static void bxe_get_params(struct bxe_softc *sc) { /* get user tunable params */ bxe_get_tunable_params(sc); /* select the RX and TX ring sizes */ sc->tx_ring_size = TX_BD_USABLE; sc->rx_ring_size = RX_BD_USABLE; /* XXX disable WoL */ sc->wol = 0; } static void bxe_set_modes_bitmap(struct bxe_softc *sc) { uint32_t flags = 0; if (CHIP_REV_IS_FPGA(sc)) { SET_FLAGS(flags, MODE_FPGA); } else if (CHIP_REV_IS_EMUL(sc)) { SET_FLAGS(flags, MODE_EMUL); } else { SET_FLAGS(flags, MODE_ASIC); } if (CHIP_IS_MODE_4_PORT(sc)) { SET_FLAGS(flags, MODE_PORT4); } else { SET_FLAGS(flags, MODE_PORT2); } if (CHIP_IS_E2(sc)) { SET_FLAGS(flags, MODE_E2); } else if (CHIP_IS_E3(sc)) { SET_FLAGS(flags, MODE_E3); if (CHIP_REV(sc) == CHIP_REV_Ax) { SET_FLAGS(flags, MODE_E3_A0); } else /*if (CHIP_REV(sc) == CHIP_REV_Bx)*/ { SET_FLAGS(flags, MODE_E3_B0 | MODE_COS3); } } if (IS_MF(sc)) { SET_FLAGS(flags, MODE_MF); switch (sc->devinfo.mf_info.mf_mode) { case MULTI_FUNCTION_SD: SET_FLAGS(flags, MODE_MF_SD); break; case MULTI_FUNCTION_SI: SET_FLAGS(flags, MODE_MF_SI); break; case MULTI_FUNCTION_AFEX: SET_FLAGS(flags, MODE_MF_AFEX); break; } } else { SET_FLAGS(flags, MODE_SF); } #if defined(__LITTLE_ENDIAN) SET_FLAGS(flags, MODE_LITTLE_ENDIAN); #else /* __BIG_ENDIAN */ SET_FLAGS(flags, MODE_BIG_ENDIAN); #endif INIT_MODE_FLAGS(sc) = flags; } static int bxe_alloc_hsi_mem(struct bxe_softc *sc) { struct bxe_fastpath *fp; bus_addr_t busaddr; int max_agg_queues; int max_segments; bus_size_t max_size; bus_size_t max_seg_size; char buf[32]; int rc; int i, j; /* XXX zero out all vars here and call bxe_alloc_hsi_mem on error */ /* allocate the parent bus DMA tag */ rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), /* parent tag */ 1, /* alignment */ 0, /* boundary limit */ BUS_SPACE_MAXADDR, /* restricted low */ BUS_SPACE_MAXADDR, /* restricted hi */ NULL, /* addr filter() */ NULL, /* addr filter() arg */ BUS_SPACE_MAXSIZE_32BIT, /* max map size */ BUS_SPACE_UNRESTRICTED, /* num discontinuous */ BUS_SPACE_MAXSIZE_32BIT, /* max seg size */ 0, /* flags */ NULL, /* lock() */ NULL, /* lock() arg */ &sc->parent_dma_tag); /* returned dma tag */ if (rc != 0) { BLOGE(sc, "Failed to alloc parent DMA tag (%d)!\n", rc); return (1); } /************************/ /* DEFAULT STATUS BLOCK */ /************************/ if (bxe_dma_alloc(sc, sizeof(struct host_sp_status_block), &sc->def_sb_dma, "default status block") != 0) { /* XXX */ bus_dma_tag_destroy(sc->parent_dma_tag); return (1); } sc->def_sb = (struct host_sp_status_block *)sc->def_sb_dma.vaddr; /***************/ /* EVENT QUEUE */ /***************/ if (bxe_dma_alloc(sc, BCM_PAGE_SIZE, &sc->eq_dma, "event queue") != 0) { /* XXX */ bxe_dma_free(sc, &sc->def_sb_dma); sc->def_sb = NULL; bus_dma_tag_destroy(sc->parent_dma_tag); return (1); } sc->eq = (union event_ring_elem * )sc->eq_dma.vaddr; /*************/ /* SLOW PATH */ /*************/ if (bxe_dma_alloc(sc, sizeof(struct bxe_slowpath), &sc->sp_dma, "slow path") != 0) { /* XXX */ bxe_dma_free(sc, &sc->eq_dma); sc->eq = NULL; bxe_dma_free(sc, &sc->def_sb_dma); sc->def_sb = NULL; bus_dma_tag_destroy(sc->parent_dma_tag); return (1); } sc->sp = (struct bxe_slowpath *)sc->sp_dma.vaddr; /*******************/ /* SLOW PATH QUEUE */ /*******************/ if (bxe_dma_alloc(sc, BCM_PAGE_SIZE, &sc->spq_dma, "slow path queue") != 0) { /* XXX */ bxe_dma_free(sc, &sc->sp_dma); sc->sp = NULL; bxe_dma_free(sc, &sc->eq_dma); sc->eq = NULL; bxe_dma_free(sc, &sc->def_sb_dma); sc->def_sb = NULL; bus_dma_tag_destroy(sc->parent_dma_tag); return (1); } sc->spq = (struct eth_spe *)sc->spq_dma.vaddr; /***************************/ /* FW DECOMPRESSION BUFFER */ /***************************/ if (bxe_dma_alloc(sc, FW_BUF_SIZE, &sc->gz_buf_dma, "fw decompression buffer") != 0) { /* XXX */ bxe_dma_free(sc, &sc->spq_dma); sc->spq = NULL; bxe_dma_free(sc, &sc->sp_dma); sc->sp = NULL; bxe_dma_free(sc, &sc->eq_dma); sc->eq = NULL; bxe_dma_free(sc, &sc->def_sb_dma); sc->def_sb = NULL; bus_dma_tag_destroy(sc->parent_dma_tag); return (1); } sc->gz_buf = (void *)sc->gz_buf_dma.vaddr; if ((sc->gz_strm = malloc(sizeof(*sc->gz_strm), M_DEVBUF, M_NOWAIT)) == NULL) { /* XXX */ bxe_dma_free(sc, &sc->gz_buf_dma); sc->gz_buf = NULL; bxe_dma_free(sc, &sc->spq_dma); sc->spq = NULL; bxe_dma_free(sc, &sc->sp_dma); sc->sp = NULL; bxe_dma_free(sc, &sc->eq_dma); sc->eq = NULL; bxe_dma_free(sc, &sc->def_sb_dma); sc->def_sb = NULL; bus_dma_tag_destroy(sc->parent_dma_tag); return (1); } /*************/ /* FASTPATHS */ /*************/ /* allocate DMA memory for each fastpath structure */ for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; fp->sc = sc; fp->index = i; /*******************/ /* FP STATUS BLOCK */ /*******************/ snprintf(buf, sizeof(buf), "fp %d status block", i); if (bxe_dma_alloc(sc, sizeof(union bxe_host_hc_status_block), &fp->sb_dma, buf) != 0) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to alloc %s\n", buf); return (1); } else { if (CHIP_IS_E2E3(sc)) { fp->status_block.e2_sb = (struct host_hc_status_block_e2 *)fp->sb_dma.vaddr; } else { fp->status_block.e1x_sb = (struct host_hc_status_block_e1x *)fp->sb_dma.vaddr; } } /******************/ /* FP TX BD CHAIN */ /******************/ snprintf(buf, sizeof(buf), "fp %d tx bd chain", i); if (bxe_dma_alloc(sc, (BCM_PAGE_SIZE * TX_BD_NUM_PAGES), &fp->tx_dma, buf) != 0) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to alloc %s\n", buf); return (1); } else { fp->tx_chain = (union eth_tx_bd_types *)fp->tx_dma.vaddr; } /* link together the tx bd chain pages */ for (j = 1; j <= TX_BD_NUM_PAGES; j++) { /* index into the tx bd chain array to last entry per page */ struct eth_tx_next_bd *tx_next_bd = &fp->tx_chain[TX_BD_TOTAL_PER_PAGE * j - 1].next_bd; /* point to the next page and wrap from last page */ busaddr = (fp->tx_dma.paddr + (BCM_PAGE_SIZE * (j % TX_BD_NUM_PAGES))); tx_next_bd->addr_hi = htole32(U64_HI(busaddr)); tx_next_bd->addr_lo = htole32(U64_LO(busaddr)); } /******************/ /* FP RX BD CHAIN */ /******************/ snprintf(buf, sizeof(buf), "fp %d rx bd chain", i); if (bxe_dma_alloc(sc, (BCM_PAGE_SIZE * RX_BD_NUM_PAGES), &fp->rx_dma, buf) != 0) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to alloc %s\n", buf); return (1); } else { fp->rx_chain = (struct eth_rx_bd *)fp->rx_dma.vaddr; } /* link together the rx bd chain pages */ for (j = 1; j <= RX_BD_NUM_PAGES; j++) { /* index into the rx bd chain array to last entry per page */ struct eth_rx_bd *rx_bd = &fp->rx_chain[RX_BD_TOTAL_PER_PAGE * j - 2]; /* point to the next page and wrap from last page */ busaddr = (fp->rx_dma.paddr + (BCM_PAGE_SIZE * (j % RX_BD_NUM_PAGES))); rx_bd->addr_hi = htole32(U64_HI(busaddr)); rx_bd->addr_lo = htole32(U64_LO(busaddr)); } /*******************/ /* FP RX RCQ CHAIN */ /*******************/ snprintf(buf, sizeof(buf), "fp %d rcq chain", i); if (bxe_dma_alloc(sc, (BCM_PAGE_SIZE * RCQ_NUM_PAGES), &fp->rcq_dma, buf) != 0) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to alloc %s\n", buf); return (1); } else { fp->rcq_chain = (union eth_rx_cqe *)fp->rcq_dma.vaddr; } /* link together the rcq chain pages */ for (j = 1; j <= RCQ_NUM_PAGES; j++) { /* index into the rcq chain array to last entry per page */ struct eth_rx_cqe_next_page *rx_cqe_next = (struct eth_rx_cqe_next_page *) &fp->rcq_chain[RCQ_TOTAL_PER_PAGE * j - 1]; /* point to the next page and wrap from last page */ busaddr = (fp->rcq_dma.paddr + (BCM_PAGE_SIZE * (j % RCQ_NUM_PAGES))); rx_cqe_next->addr_hi = htole32(U64_HI(busaddr)); rx_cqe_next->addr_lo = htole32(U64_LO(busaddr)); } /*******************/ /* FP RX SGE CHAIN */ /*******************/ snprintf(buf, sizeof(buf), "fp %d sge chain", i); if (bxe_dma_alloc(sc, (BCM_PAGE_SIZE * RX_SGE_NUM_PAGES), &fp->rx_sge_dma, buf) != 0) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to alloc %s\n", buf); return (1); } else { fp->rx_sge_chain = (struct eth_rx_sge *)fp->rx_sge_dma.vaddr; } /* link together the sge chain pages */ for (j = 1; j <= RX_SGE_NUM_PAGES; j++) { /* index into the rcq chain array to last entry per page */ struct eth_rx_sge *rx_sge = &fp->rx_sge_chain[RX_SGE_TOTAL_PER_PAGE * j - 2]; /* point to the next page and wrap from last page */ busaddr = (fp->rx_sge_dma.paddr + (BCM_PAGE_SIZE * (j % RX_SGE_NUM_PAGES))); rx_sge->addr_hi = htole32(U64_HI(busaddr)); rx_sge->addr_lo = htole32(U64_LO(busaddr)); } /***********************/ /* FP TX MBUF DMA MAPS */ /***********************/ /* set required sizes before mapping to conserve resources */ if (if_getcapenable(sc->ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) { max_size = BXE_TSO_MAX_SIZE; max_segments = BXE_TSO_MAX_SEGMENTS; max_seg_size = BXE_TSO_MAX_SEG_SIZE; } else { max_size = (MCLBYTES * BXE_MAX_SEGMENTS); max_segments = BXE_MAX_SEGMENTS; max_seg_size = MCLBYTES; } /* create a dma tag for the tx mbufs */ rc = bus_dma_tag_create(sc->parent_dma_tag, /* parent tag */ 1, /* alignment */ 0, /* boundary limit */ BUS_SPACE_MAXADDR, /* restricted low */ BUS_SPACE_MAXADDR, /* restricted hi */ NULL, /* addr filter() */ NULL, /* addr filter() arg */ max_size, /* max map size */ max_segments, /* num discontinuous */ max_seg_size, /* max seg size */ 0, /* flags */ NULL, /* lock() */ NULL, /* lock() arg */ &fp->tx_mbuf_tag); /* returned dma tag */ if (rc != 0) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma tag for " "'fp %d tx mbufs' (%d)\n", i, rc); return (1); } /* create dma maps for each of the tx mbuf clusters */ for (j = 0; j < TX_BD_TOTAL; j++) { if (bus_dmamap_create(fp->tx_mbuf_tag, BUS_DMA_NOWAIT, &fp->tx_mbuf_chain[j].m_map)) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma map for " "'fp %d tx mbuf %d' (%d)\n", i, j, rc); return (1); } } /***********************/ /* FP RX MBUF DMA MAPS */ /***********************/ /* create a dma tag for the rx mbufs */ rc = bus_dma_tag_create(sc->parent_dma_tag, /* parent tag */ 1, /* alignment */ 0, /* boundary limit */ BUS_SPACE_MAXADDR, /* restricted low */ BUS_SPACE_MAXADDR, /* restricted hi */ NULL, /* addr filter() */ NULL, /* addr filter() arg */ MJUM9BYTES, /* max map size */ 1, /* num discontinuous */ MJUM9BYTES, /* max seg size */ 0, /* flags */ NULL, /* lock() */ NULL, /* lock() arg */ &fp->rx_mbuf_tag); /* returned dma tag */ if (rc != 0) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma tag for " "'fp %d rx mbufs' (%d)\n", i, rc); return (1); } /* create dma maps for each of the rx mbuf clusters */ for (j = 0; j < RX_BD_TOTAL; j++) { if (bus_dmamap_create(fp->rx_mbuf_tag, BUS_DMA_NOWAIT, &fp->rx_mbuf_chain[j].m_map)) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma map for " "'fp %d rx mbuf %d' (%d)\n", i, j, rc); return (1); } } /* create dma map for the spare rx mbuf cluster */ if (bus_dmamap_create(fp->rx_mbuf_tag, BUS_DMA_NOWAIT, &fp->rx_mbuf_spare_map)) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma map for " "'fp %d spare rx mbuf' (%d)\n", i, rc); return (1); } /***************************/ /* FP RX SGE MBUF DMA MAPS */ /***************************/ /* create a dma tag for the rx sge mbufs */ rc = bus_dma_tag_create(sc->parent_dma_tag, /* parent tag */ 1, /* alignment */ 0, /* boundary limit */ BUS_SPACE_MAXADDR, /* restricted low */ BUS_SPACE_MAXADDR, /* restricted hi */ NULL, /* addr filter() */ NULL, /* addr filter() arg */ BCM_PAGE_SIZE, /* max map size */ 1, /* num discontinuous */ BCM_PAGE_SIZE, /* max seg size */ 0, /* flags */ NULL, /* lock() */ NULL, /* lock() arg */ &fp->rx_sge_mbuf_tag); /* returned dma tag */ if (rc != 0) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma tag for " "'fp %d rx sge mbufs' (%d)\n", i, rc); return (1); } /* create dma maps for the rx sge mbuf clusters */ for (j = 0; j < RX_SGE_TOTAL; j++) { if (bus_dmamap_create(fp->rx_sge_mbuf_tag, BUS_DMA_NOWAIT, &fp->rx_sge_mbuf_chain[j].m_map)) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma map for " "'fp %d rx sge mbuf %d' (%d)\n", i, j, rc); return (1); } } /* create dma map for the spare rx sge mbuf cluster */ if (bus_dmamap_create(fp->rx_sge_mbuf_tag, BUS_DMA_NOWAIT, &fp->rx_sge_mbuf_spare_map)) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma map for " "'fp %d spare rx sge mbuf' (%d)\n", i, rc); return (1); } /***************************/ /* FP RX TPA MBUF DMA MAPS */ /***************************/ /* create dma maps for the rx tpa mbuf clusters */ max_agg_queues = MAX_AGG_QS(sc); for (j = 0; j < max_agg_queues; j++) { if (bus_dmamap_create(fp->rx_mbuf_tag, BUS_DMA_NOWAIT, &fp->rx_tpa_info[j].bd.m_map)) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma map for " "'fp %d rx tpa mbuf %d' (%d)\n", i, j, rc); return (1); } } /* create dma map for the spare rx tpa mbuf cluster */ if (bus_dmamap_create(fp->rx_mbuf_tag, BUS_DMA_NOWAIT, &fp->rx_tpa_info_mbuf_spare_map)) { /* XXX unwind and free previous fastpath allocations */ BLOGE(sc, "Failed to create dma map for " "'fp %d spare rx tpa mbuf' (%d)\n", i, rc); return (1); } bxe_init_sge_ring_bit_mask(fp); } return (0); } static void bxe_free_hsi_mem(struct bxe_softc *sc) { struct bxe_fastpath *fp; int max_agg_queues; int i, j; if (sc->parent_dma_tag == NULL) { return; /* assume nothing was allocated */ } for (i = 0; i < sc->num_queues; i++) { fp = &sc->fp[i]; /*******************/ /* FP STATUS BLOCK */ /*******************/ bxe_dma_free(sc, &fp->sb_dma); memset(&fp->status_block, 0, sizeof(fp->status_block)); /******************/ /* FP TX BD CHAIN */ /******************/ bxe_dma_free(sc, &fp->tx_dma); fp->tx_chain = NULL; /******************/ /* FP RX BD CHAIN */ /******************/ bxe_dma_free(sc, &fp->rx_dma); fp->rx_chain = NULL; /*******************/ /* FP RX RCQ CHAIN */ /*******************/ bxe_dma_free(sc, &fp->rcq_dma); fp->rcq_chain = NULL; /*******************/ /* FP RX SGE CHAIN */ /*******************/ bxe_dma_free(sc, &fp->rx_sge_dma); fp->rx_sge_chain = NULL; /***********************/ /* FP TX MBUF DMA MAPS */ /***********************/ if (fp->tx_mbuf_tag != NULL) { for (j = 0; j < TX_BD_TOTAL; j++) { if (fp->tx_mbuf_chain[j].m_map != NULL) { bus_dmamap_unload(fp->tx_mbuf_tag, fp->tx_mbuf_chain[j].m_map); bus_dmamap_destroy(fp->tx_mbuf_tag, fp->tx_mbuf_chain[j].m_map); } } bus_dma_tag_destroy(fp->tx_mbuf_tag); fp->tx_mbuf_tag = NULL; } /***********************/ /* FP RX MBUF DMA MAPS */ /***********************/ if (fp->rx_mbuf_tag != NULL) { for (j = 0; j < RX_BD_TOTAL; j++) { if (fp->rx_mbuf_chain[j].m_map != NULL) { bus_dmamap_unload(fp->rx_mbuf_tag, fp->rx_mbuf_chain[j].m_map); bus_dmamap_destroy(fp->rx_mbuf_tag, fp->rx_mbuf_chain[j].m_map); } } if (fp->rx_mbuf_spare_map != NULL) { bus_dmamap_unload(fp->rx_mbuf_tag, fp->rx_mbuf_spare_map); bus_dmamap_destroy(fp->rx_mbuf_tag, fp->rx_mbuf_spare_map); } /***************************/ /* FP RX TPA MBUF DMA MAPS */ /***************************/ max_agg_queues = MAX_AGG_QS(sc); for (j = 0; j < max_agg_queues; j++) { if (fp->rx_tpa_info[j].bd.m_map != NULL) { bus_dmamap_unload(fp->rx_mbuf_tag, fp->rx_tpa_info[j].bd.m_map); bus_dmamap_destroy(fp->rx_mbuf_tag, fp->rx_tpa_info[j].bd.m_map); } } if (fp->rx_tpa_info_mbuf_spare_map != NULL) { bus_dmamap_unload(fp->rx_mbuf_tag, fp->rx_tpa_info_mbuf_spare_map); bus_dmamap_destroy(fp->rx_mbuf_tag, fp->rx_tpa_info_mbuf_spare_map); } bus_dma_tag_destroy(fp->rx_mbuf_tag); fp->rx_mbuf_tag = NULL; } /***************************/ /* FP RX SGE MBUF DMA MAPS */ /***************************/ if (fp->rx_sge_mbuf_tag != NULL) { for (j = 0; j < RX_SGE_TOTAL; j++) { if (fp->rx_sge_mbuf_chain[j].m_map != NULL) { bus_dmamap_unload(fp->rx_sge_mbuf_tag, fp->rx_sge_mbuf_chain[j].m_map); bus_dmamap_destroy(fp->rx_sge_mbuf_tag, fp->rx_sge_mbuf_chain[j].m_map); } } if (fp->rx_sge_mbuf_spare_map != NULL) { bus_dmamap_unload(fp->rx_sge_mbuf_tag, fp->rx_sge_mbuf_spare_map); bus_dmamap_destroy(fp->rx_sge_mbuf_tag, fp->rx_sge_mbuf_spare_map); } bus_dma_tag_destroy(fp->rx_sge_mbuf_tag); fp->rx_sge_mbuf_tag = NULL; } } /***************************/ /* FW DECOMPRESSION BUFFER */ /***************************/ bxe_dma_free(sc, &sc->gz_buf_dma); sc->gz_buf = NULL; free(sc->gz_strm, M_DEVBUF); sc->gz_strm = NULL; /*******************/ /* SLOW PATH QUEUE */ /*******************/ bxe_dma_free(sc, &sc->spq_dma); sc->spq = NULL; /*************/ /* SLOW PATH */ /*************/ bxe_dma_free(sc, &sc->sp_dma); sc->sp = NULL; /***************/ /* EVENT QUEUE */ /***************/ bxe_dma_free(sc, &sc->eq_dma); sc->eq = NULL; /************************/ /* DEFAULT STATUS BLOCK */ /************************/ bxe_dma_free(sc, &sc->def_sb_dma); sc->def_sb = NULL; bus_dma_tag_destroy(sc->parent_dma_tag); sc->parent_dma_tag = NULL; } /* * Previous driver DMAE transaction may have occurred when pre-boot stage * ended and boot began. This would invalidate the addresses of the * transaction, resulting in was-error bit set in the PCI causing all * hw-to-host PCIe transactions to timeout. If this happened we want to clear * the interrupt which detected this from the pglueb and the was-done bit */ static void bxe_prev_interrupted_dmae(struct bxe_softc *sc) { uint32_t val; if (!CHIP_IS_E1x(sc)) { val = REG_RD(sc, PGLUE_B_REG_PGLUE_B_INT_STS); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_WAS_ERROR_ATTN) { BLOGD(sc, DBG_LOAD, "Clearing 'was-error' bit that was set in pglueb"); REG_WR(sc, PGLUE_B_REG_WAS_ERROR_PF_7_0_CLR, 1 << SC_FUNC(sc)); } } } static int bxe_prev_mcp_done(struct bxe_softc *sc) { uint32_t rc = bxe_fw_command(sc, DRV_MSG_CODE_UNLOAD_DONE, DRV_MSG_CODE_UNLOAD_SKIP_LINK_RESET); if (!rc) { BLOGE(sc, "MCP response failure, aborting\n"); return (-1); } return (0); } static struct bxe_prev_list_node * bxe_prev_path_get_entry(struct bxe_softc *sc) { struct bxe_prev_list_node *tmp; LIST_FOREACH(tmp, &bxe_prev_list, node) { if ((sc->pcie_bus == tmp->bus) && (sc->pcie_device == tmp->slot) && (SC_PATH(sc) == tmp->path)) { return (tmp); } } return (NULL); } static uint8_t bxe_prev_is_path_marked(struct bxe_softc *sc) { struct bxe_prev_list_node *tmp; int rc = FALSE; mtx_lock(&bxe_prev_mtx); tmp = bxe_prev_path_get_entry(sc); if (tmp) { if (tmp->aer) { BLOGD(sc, DBG_LOAD, "Path %d/%d/%d was marked by AER\n", sc->pcie_bus, sc->pcie_device, SC_PATH(sc)); } else { rc = TRUE; BLOGD(sc, DBG_LOAD, "Path %d/%d/%d was already cleaned from previous drivers\n", sc->pcie_bus, sc->pcie_device, SC_PATH(sc)); } } mtx_unlock(&bxe_prev_mtx); return (rc); } static int bxe_prev_mark_path(struct bxe_softc *sc, uint8_t after_undi) { struct bxe_prev_list_node *tmp; mtx_lock(&bxe_prev_mtx); /* Check whether the entry for this path already exists */ tmp = bxe_prev_path_get_entry(sc); if (tmp) { if (!tmp->aer) { BLOGD(sc, DBG_LOAD, "Re-marking AER in path %d/%d/%d\n", sc->pcie_bus, sc->pcie_device, SC_PATH(sc)); } else { BLOGD(sc, DBG_LOAD, "Removing AER indication from path %d/%d/%d\n", sc->pcie_bus, sc->pcie_device, SC_PATH(sc)); tmp->aer = 0; } mtx_unlock(&bxe_prev_mtx); return (0); } mtx_unlock(&bxe_prev_mtx); /* Create an entry for this path and add it */ tmp = malloc(sizeof(struct bxe_prev_list_node), M_DEVBUF, (M_NOWAIT | M_ZERO)); if (!tmp) { BLOGE(sc, "Failed to allocate 'bxe_prev_list_node'\n"); return (-1); } tmp->bus = sc->pcie_bus; tmp->slot = sc->pcie_device; tmp->path = SC_PATH(sc); tmp->aer = 0; tmp->undi = after_undi ? (1 << SC_PORT(sc)) : 0; mtx_lock(&bxe_prev_mtx); BLOGD(sc, DBG_LOAD, "Marked path %d/%d/%d - finished previous unload\n", sc->pcie_bus, sc->pcie_device, SC_PATH(sc)); LIST_INSERT_HEAD(&bxe_prev_list, tmp, node); mtx_unlock(&bxe_prev_mtx); return (0); } static int bxe_do_flr(struct bxe_softc *sc) { int i; /* only E2 and onwards support FLR */ if (CHIP_IS_E1x(sc)) { BLOGD(sc, DBG_LOAD, "FLR not supported in E1/E1H\n"); return (-1); } /* only bootcode REQ_BC_VER_4_INITIATE_FLR and onwards support flr */ if (sc->devinfo.bc_ver < REQ_BC_VER_4_INITIATE_FLR) { BLOGD(sc, DBG_LOAD, "FLR not supported by BC_VER: 0x%08x\n", sc->devinfo.bc_ver); return (-1); } /* Wait for Transaction Pending bit clean */ for (i = 0; i < 4; i++) { if (i) { DELAY(((1 << (i - 1)) * 100) * 1000); } if (!bxe_is_pcie_pending(sc)) { goto clear; } } BLOGE(sc, "PCIE transaction is not cleared, " "proceeding with reset anyway\n"); clear: BLOGD(sc, DBG_LOAD, "Initiating FLR\n"); bxe_fw_command(sc, DRV_MSG_CODE_INITIATE_FLR, 0); return (0); } struct bxe_mac_vals { uint32_t xmac_addr; uint32_t xmac_val; uint32_t emac_addr; uint32_t emac_val; uint32_t umac_addr; uint32_t umac_val; uint32_t bmac_addr; uint32_t bmac_val[2]; }; static void bxe_prev_unload_close_mac(struct bxe_softc *sc, struct bxe_mac_vals *vals) { uint32_t val, base_addr, offset, mask, reset_reg; uint8_t mac_stopped = FALSE; uint8_t port = SC_PORT(sc); uint32_t wb_data[2]; /* reset addresses as they also mark which values were changed */ vals->bmac_addr = 0; vals->umac_addr = 0; vals->xmac_addr = 0; vals->emac_addr = 0; reset_reg = REG_RD(sc, MISC_REG_RESET_REG_2); if (!CHIP_IS_E3(sc)) { val = REG_RD(sc, NIG_REG_BMAC0_REGS_OUT_EN + port * 4); mask = MISC_REGISTERS_RESET_REG_2_RST_BMAC0 << port; if ((mask & reset_reg) && val) { BLOGD(sc, DBG_LOAD, "Disable BMAC Rx\n"); base_addr = SC_PORT(sc) ? NIG_REG_INGRESS_BMAC1_MEM : NIG_REG_INGRESS_BMAC0_MEM; offset = CHIP_IS_E2(sc) ? BIGMAC2_REGISTER_BMAC_CONTROL : BIGMAC_REGISTER_BMAC_CONTROL; /* * use rd/wr since we cannot use dmae. This is safe * since MCP won't access the bus due to the request * to unload, and no function on the path can be * loaded at this time. */ wb_data[0] = REG_RD(sc, base_addr + offset); wb_data[1] = REG_RD(sc, base_addr + offset + 0x4); vals->bmac_addr = base_addr + offset; vals->bmac_val[0] = wb_data[0]; vals->bmac_val[1] = wb_data[1]; wb_data[0] &= ~ELINK_BMAC_CONTROL_RX_ENABLE; REG_WR(sc, vals->bmac_addr, wb_data[0]); REG_WR(sc, vals->bmac_addr + 0x4, wb_data[1]); } BLOGD(sc, DBG_LOAD, "Disable EMAC Rx\n"); vals->emac_addr = NIG_REG_NIG_EMAC0_EN + SC_PORT(sc)*4; vals->emac_val = REG_RD(sc, vals->emac_addr); REG_WR(sc, vals->emac_addr, 0); mac_stopped = TRUE; } else { if (reset_reg & MISC_REGISTERS_RESET_REG_2_XMAC) { BLOGD(sc, DBG_LOAD, "Disable XMAC Rx\n"); base_addr = SC_PORT(sc) ? GRCBASE_XMAC1 : GRCBASE_XMAC0; val = REG_RD(sc, base_addr + XMAC_REG_PFC_CTRL_HI); REG_WR(sc, base_addr + XMAC_REG_PFC_CTRL_HI, val & ~(1 << 1)); REG_WR(sc, base_addr + XMAC_REG_PFC_CTRL_HI, val | (1 << 1)); vals->xmac_addr = base_addr + XMAC_REG_CTRL; vals->xmac_val = REG_RD(sc, vals->xmac_addr); REG_WR(sc, vals->xmac_addr, 0); mac_stopped = TRUE; } mask = MISC_REGISTERS_RESET_REG_2_UMAC0 << port; if (mask & reset_reg) { BLOGD(sc, DBG_LOAD, "Disable UMAC Rx\n"); base_addr = SC_PORT(sc) ? GRCBASE_UMAC1 : GRCBASE_UMAC0; vals->umac_addr = base_addr + UMAC_REG_COMMAND_CONFIG; vals->umac_val = REG_RD(sc, vals->umac_addr); REG_WR(sc, vals->umac_addr, 0); mac_stopped = TRUE; } } if (mac_stopped) { DELAY(20000); } } #define BXE_PREV_UNDI_PROD_ADDR(p) (BAR_TSTRORM_INTMEM + 0x1508 + ((p) << 4)) #define BXE_PREV_UNDI_RCQ(val) ((val) & 0xffff) #define BXE_PREV_UNDI_BD(val) ((val) >> 16 & 0xffff) #define BXE_PREV_UNDI_PROD(rcq, bd) ((bd) << 16 | (rcq)) static void bxe_prev_unload_undi_inc(struct bxe_softc *sc, uint8_t port, uint8_t inc) { uint16_t rcq, bd; uint32_t tmp_reg = REG_RD(sc, BXE_PREV_UNDI_PROD_ADDR(port)); rcq = BXE_PREV_UNDI_RCQ(tmp_reg) + inc; bd = BXE_PREV_UNDI_BD(tmp_reg) + inc; tmp_reg = BXE_PREV_UNDI_PROD(rcq, bd); REG_WR(sc, BXE_PREV_UNDI_PROD_ADDR(port), tmp_reg); BLOGD(sc, DBG_LOAD, "UNDI producer [%d] rings bd -> 0x%04x, rcq -> 0x%04x\n", port, bd, rcq); } static int bxe_prev_unload_common(struct bxe_softc *sc) { uint32_t reset_reg, tmp_reg = 0, rc; uint8_t prev_undi = FALSE; struct bxe_mac_vals mac_vals; uint32_t timer_count = 1000; uint32_t prev_brb; /* * It is possible a previous function received 'common' answer, * but hasn't loaded yet, therefore creating a scenario of * multiple functions receiving 'common' on the same path. */ BLOGD(sc, DBG_LOAD, "Common unload Flow\n"); memset(&mac_vals, 0, sizeof(mac_vals)); if (bxe_prev_is_path_marked(sc)) { return (bxe_prev_mcp_done(sc)); } reset_reg = REG_RD(sc, MISC_REG_RESET_REG_1); /* Reset should be performed after BRB is emptied */ if (reset_reg & MISC_REGISTERS_RESET_REG_1_RST_BRB1) { /* Close the MAC Rx to prevent BRB from filling up */ bxe_prev_unload_close_mac(sc, &mac_vals); /* close LLH filters towards the BRB */ elink_set_rx_filter(&sc->link_params, 0); /* * Check if the UNDI driver was previously loaded. * UNDI driver initializes CID offset for normal bell to 0x7 */ if (reset_reg & MISC_REGISTERS_RESET_REG_1_RST_DORQ) { tmp_reg = REG_RD(sc, DORQ_REG_NORM_CID_OFST); if (tmp_reg == 0x7) { BLOGD(sc, DBG_LOAD, "UNDI previously loaded\n"); prev_undi = TRUE; /* clear the UNDI indication */ REG_WR(sc, DORQ_REG_NORM_CID_OFST, 0); /* clear possible idle check errors */ REG_RD(sc, NIG_REG_NIG_INT_STS_CLR_0); } } /* wait until BRB is empty */ tmp_reg = REG_RD(sc, BRB1_REG_NUM_OF_FULL_BLOCKS); while (timer_count) { prev_brb = tmp_reg; tmp_reg = REG_RD(sc, BRB1_REG_NUM_OF_FULL_BLOCKS); if (!tmp_reg) { break; } BLOGD(sc, DBG_LOAD, "BRB still has 0x%08x\n", tmp_reg); /* reset timer as long as BRB actually gets emptied */ if (prev_brb > tmp_reg) { timer_count = 1000; } else { timer_count--; } /* If UNDI resides in memory, manually increment it */ if (prev_undi) { bxe_prev_unload_undi_inc(sc, SC_PORT(sc), 1); } DELAY(10); } if (!timer_count) { BLOGE(sc, "Failed to empty BRB\n"); } } /* No packets are in the pipeline, path is ready for reset */ bxe_reset_common(sc); if (mac_vals.xmac_addr) { REG_WR(sc, mac_vals.xmac_addr, mac_vals.xmac_val); } if (mac_vals.umac_addr) { REG_WR(sc, mac_vals.umac_addr, mac_vals.umac_val); } if (mac_vals.emac_addr) { REG_WR(sc, mac_vals.emac_addr, mac_vals.emac_val); } if (mac_vals.bmac_addr) { REG_WR(sc, mac_vals.bmac_addr, mac_vals.bmac_val[0]); REG_WR(sc, mac_vals.bmac_addr + 4, mac_vals.bmac_val[1]); } rc = bxe_prev_mark_path(sc, prev_undi); if (rc) { bxe_prev_mcp_done(sc); return (rc); } return (bxe_prev_mcp_done(sc)); } static int bxe_prev_unload_uncommon(struct bxe_softc *sc) { int rc; BLOGD(sc, DBG_LOAD, "Uncommon unload Flow\n"); /* Test if previous unload process was already finished for this path */ if (bxe_prev_is_path_marked(sc)) { return (bxe_prev_mcp_done(sc)); } BLOGD(sc, DBG_LOAD, "Path is unmarked\n"); /* * If function has FLR capabilities, and existing FW version matches * the one required, then FLR will be sufficient to clean any residue * left by previous driver */ rc = bxe_nic_load_analyze_req(sc, FW_MSG_CODE_DRV_LOAD_FUNCTION); if (!rc) { /* fw version is good */ BLOGD(sc, DBG_LOAD, "FW version matches our own, attempting FLR\n"); rc = bxe_do_flr(sc); } if (!rc) { /* FLR was performed */ BLOGD(sc, DBG_LOAD, "FLR successful\n"); return (0); } BLOGD(sc, DBG_LOAD, "Could not FLR\n"); /* Close the MCP request, return failure*/ rc = bxe_prev_mcp_done(sc); if (!rc) { rc = BXE_PREV_WAIT_NEEDED; } return (rc); } static int bxe_prev_unload(struct bxe_softc *sc) { int time_counter = 10; uint32_t fw, hw_lock_reg, hw_lock_val; uint32_t rc = 0; /* * Clear HW from errors which may have resulted from an interrupted * DMAE transaction. */ bxe_prev_interrupted_dmae(sc); /* Release previously held locks */ hw_lock_reg = (SC_FUNC(sc) <= 5) ? (MISC_REG_DRIVER_CONTROL_1 + SC_FUNC(sc) * 8) : (MISC_REG_DRIVER_CONTROL_7 + (SC_FUNC(sc) - 6) * 8); hw_lock_val = (REG_RD(sc, hw_lock_reg)); if (hw_lock_val) { if (hw_lock_val & HW_LOCK_RESOURCE_NVRAM) { BLOGD(sc, DBG_LOAD, "Releasing previously held NVRAM lock\n"); REG_WR(sc, MCP_REG_MCPR_NVM_SW_ARB, (MCPR_NVM_SW_ARB_ARB_REQ_CLR1 << SC_PORT(sc))); } BLOGD(sc, DBG_LOAD, "Releasing previously held HW lock\n"); REG_WR(sc, hw_lock_reg, 0xffffffff); } else { BLOGD(sc, DBG_LOAD, "No need to release HW/NVRAM locks\n"); } if (MCPR_ACCESS_LOCK_LOCK & REG_RD(sc, MCP_REG_MCPR_ACCESS_LOCK)) { BLOGD(sc, DBG_LOAD, "Releasing previously held ALR\n"); REG_WR(sc, MCP_REG_MCPR_ACCESS_LOCK, 0); } do { /* Lock MCP using an unload request */ fw = bxe_fw_command(sc, DRV_MSG_CODE_UNLOAD_REQ_WOL_DIS, 0); if (!fw) { BLOGE(sc, "MCP response failure, aborting\n"); rc = -1; break; } if (fw == FW_MSG_CODE_DRV_UNLOAD_COMMON) { rc = bxe_prev_unload_common(sc); break; } /* non-common reply from MCP night require looping */ rc = bxe_prev_unload_uncommon(sc); if (rc != BXE_PREV_WAIT_NEEDED) { break; } DELAY(20000); } while (--time_counter); if (!time_counter || rc) { BLOGE(sc, "Failed to unload previous driver!\n"); rc = -1; } return (rc); } void bxe_dcbx_set_state(struct bxe_softc *sc, uint8_t dcb_on, uint32_t dcbx_enabled) { if (!CHIP_IS_E1x(sc)) { sc->dcb_state = dcb_on; sc->dcbx_enabled = dcbx_enabled; } else { sc->dcb_state = FALSE; sc->dcbx_enabled = BXE_DCBX_ENABLED_INVALID; } BLOGD(sc, DBG_LOAD, "DCB state [%s:%s]\n", dcb_on ? "ON" : "OFF", (dcbx_enabled == BXE_DCBX_ENABLED_OFF) ? "user-mode" : (dcbx_enabled == BXE_DCBX_ENABLED_ON_NEG_OFF) ? "on-chip static" : (dcbx_enabled == BXE_DCBX_ENABLED_ON_NEG_ON) ? "on-chip with negotiation" : "invalid"); } /* must be called after sriov-enable */ static int bxe_set_qm_cid_count(struct bxe_softc *sc) { int cid_count = BXE_L2_MAX_CID(sc); if (IS_SRIOV(sc)) { cid_count += BXE_VF_CIDS; } if (CNIC_SUPPORT(sc)) { cid_count += CNIC_CID_MAX; } return (roundup(cid_count, QM_CID_ROUND)); } static void bxe_init_multi_cos(struct bxe_softc *sc) { int pri, cos; uint32_t pri_map = 0; /* XXX change to user config */ for (pri = 0; pri < BXE_MAX_PRIORITY; pri++) { cos = ((pri_map & (0xf << (pri * 4))) >> (pri * 4)); if (cos < sc->max_cos) { sc->prio_to_cos[pri] = cos; } else { BLOGW(sc, "Invalid COS %d for priority %d " "(max COS is %d), setting to 0\n", cos, pri, (sc->max_cos - 1)); sc->prio_to_cos[pri] = 0; } } } static int bxe_sysctl_state(SYSCTL_HANDLER_ARGS) { struct bxe_softc *sc; int error, result; result = 0; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) { return (error); } if (result == 1) { sc = (struct bxe_softc *)arg1; BLOGI(sc, "... dumping driver state ...\n"); /* XXX */ } return (error); } static int bxe_sysctl_eth_stat(SYSCTL_HANDLER_ARGS) { struct bxe_softc *sc = (struct bxe_softc *)arg1; uint32_t *eth_stats = (uint32_t *)&sc->eth_stats; uint32_t *offset; uint64_t value = 0; int index = (int)arg2; if (index >= BXE_NUM_ETH_STATS) { BLOGE(sc, "bxe_eth_stats index out of range (%d)\n", index); return (-1); } offset = (eth_stats + bxe_eth_stats_arr[index].offset); switch (bxe_eth_stats_arr[index].size) { case 4: value = (uint64_t)*offset; break; case 8: value = HILO_U64(*offset, *(offset + 1)); break; default: BLOGE(sc, "Invalid bxe_eth_stats size (index=%d size=%d)\n", index, bxe_eth_stats_arr[index].size); return (-1); } return (sysctl_handle_64(oidp, &value, 0, req)); } static int bxe_sysctl_eth_q_stat(SYSCTL_HANDLER_ARGS) { struct bxe_softc *sc = (struct bxe_softc *)arg1; uint32_t *eth_stats; uint32_t *offset; uint64_t value = 0; uint32_t q_stat = (uint32_t)arg2; uint32_t fp_index = ((q_stat >> 16) & 0xffff); uint32_t index = (q_stat & 0xffff); eth_stats = (uint32_t *)&sc->fp[fp_index].eth_q_stats; if (index >= BXE_NUM_ETH_Q_STATS) { BLOGE(sc, "bxe_eth_q_stats index out of range (%d)\n", index); return (-1); } offset = (eth_stats + bxe_eth_q_stats_arr[index].offset); switch (bxe_eth_q_stats_arr[index].size) { case 4: value = (uint64_t)*offset; break; case 8: value = HILO_U64(*offset, *(offset + 1)); break; default: BLOGE(sc, "Invalid bxe_eth_q_stats size (index=%d size=%d)\n", index, bxe_eth_q_stats_arr[index].size); return (-1); } return (sysctl_handle_64(oidp, &value, 0, req)); } static void bxe_add_sysctls(struct bxe_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; struct sysctl_oid *queue_top, *queue; struct sysctl_oid_list *queue_top_children, *queue_children; char queue_num_buf[32]; uint32_t q_stat; int i, j; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "version", CTLFLAG_RD, BXE_DRIVER_VERSION, 0, "version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "bc_version", CTLFLAG_RD, sc->devinfo.bc_ver_str, 0, "bootcode version"); snprintf(sc->fw_ver_str, sizeof(sc->fw_ver_str), "%d.%d.%d.%d", BCM_5710_FW_MAJOR_VERSION, BCM_5710_FW_MINOR_VERSION, BCM_5710_FW_REVISION_VERSION, BCM_5710_FW_ENGINEERING_VERSION); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "fw_version", CTLFLAG_RD, sc->fw_ver_str, 0, "firmware version"); snprintf(sc->mf_mode_str, sizeof(sc->mf_mode_str), "%s", ((sc->devinfo.mf_info.mf_mode == SINGLE_FUNCTION) ? "Single" : (sc->devinfo.mf_info.mf_mode == MULTI_FUNCTION_SD) ? "MF-SD" : (sc->devinfo.mf_info.mf_mode == MULTI_FUNCTION_SI) ? "MF-SI" : (sc->devinfo.mf_info.mf_mode == MULTI_FUNCTION_AFEX) ? "MF-AFEX" : "Unknown")); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "mf_mode", CTLFLAG_RD, sc->mf_mode_str, 0, "multifunction mode"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "mf_vnics", CTLFLAG_RD, &sc->devinfo.mf_info.vnics_per_port, 0, "multifunction vnics per port"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "mac_addr", CTLFLAG_RD, sc->mac_addr_str, 0, "mac address"); snprintf(sc->pci_link_str, sizeof(sc->pci_link_str), "%s x%d", ((sc->devinfo.pcie_link_speed == 1) ? "2.5GT/s" : (sc->devinfo.pcie_link_speed == 2) ? "5.0GT/s" : (sc->devinfo.pcie_link_speed == 4) ? "8.0GT/s" : "???GT/s"), sc->devinfo.pcie_link_width); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pci_link", CTLFLAG_RD, sc->pci_link_str, 0, "pci link status"); sc->debug = bxe_debug; SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "debug", CTLFLAG_RW, &sc->debug, "debug logging mode"); sc->rx_budget = bxe_rx_budget; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_budget", CTLFLAG_RW, &sc->rx_budget, 0, "rx processing budget"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "state", CTLTYPE_UINT | CTLFLAG_RW, sc, 0, bxe_sysctl_state, "IU", "dump driver state"); for (i = 0; i < BXE_NUM_ETH_STATS; i++) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, bxe_eth_stats_arr[i].string, CTLTYPE_U64 | CTLFLAG_RD, sc, i, bxe_sysctl_eth_stat, "LU", bxe_eth_stats_arr[i].string); } /* add a new parent node for all queues "dev.bxe.#.queue" */ queue_top = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "queue", CTLFLAG_RD, NULL, "queue"); queue_top_children = SYSCTL_CHILDREN(queue_top); for (i = 0; i < sc->num_queues; i++) { /* add a new parent node for a single queue "dev.bxe.#.queue.#" */ snprintf(queue_num_buf, sizeof(queue_num_buf), "%d", i); queue = SYSCTL_ADD_NODE(ctx, queue_top_children, OID_AUTO, queue_num_buf, CTLFLAG_RD, NULL, "single queue"); queue_children = SYSCTL_CHILDREN(queue); for (j = 0; j < BXE_NUM_ETH_Q_STATS; j++) { q_stat = ((i << 16) | j); SYSCTL_ADD_PROC(ctx, queue_children, OID_AUTO, bxe_eth_q_stats_arr[j].string, CTLTYPE_U64 | CTLFLAG_RD, sc, q_stat, bxe_sysctl_eth_q_stat, "LU", bxe_eth_q_stats_arr[j].string); } } } /* * Device attach function. * * Allocates device resources, performs secondary chip identification, and * initializes driver instance variables. This function is called from driver * load after a successful probe. * * Returns: * 0 = Success, >0 = Failure */ static int bxe_attach(device_t dev) { struct bxe_softc *sc; sc = device_get_softc(dev); BLOGD(sc, DBG_LOAD, "Starting attach...\n"); sc->state = BXE_STATE_CLOSED; sc->dev = dev; sc->unit = device_get_unit(dev); BLOGD(sc, DBG_LOAD, "softc = %p\n", sc); sc->pcie_bus = pci_get_bus(dev); sc->pcie_device = pci_get_slot(dev); sc->pcie_func = pci_get_function(dev); /* enable bus master capability */ pci_enable_busmaster(dev); /* get the BARs */ if (bxe_allocate_bars(sc) != 0) { return (ENXIO); } /* initialize the mutexes */ bxe_init_mutexes(sc); /* prepare the periodic callout */ callout_init(&sc->periodic_callout, 0); /* prepare the chip taskqueue */ sc->chip_tq_flags = CHIP_TQ_NONE; snprintf(sc->chip_tq_name, sizeof(sc->chip_tq_name), "bxe%d_chip_tq", sc->unit); TASK_INIT(&sc->chip_tq_task, 0, bxe_handle_chip_tq, sc); sc->chip_tq = taskqueue_create(sc->chip_tq_name, M_NOWAIT, taskqueue_thread_enqueue, &sc->chip_tq); taskqueue_start_threads(&sc->chip_tq, 1, PWAIT, /* lower priority */ "%s", sc->chip_tq_name); /* get device info and set params */ if (bxe_get_device_info(sc) != 0) { BLOGE(sc, "getting device info\n"); bxe_deallocate_bars(sc); pci_disable_busmaster(dev); return (ENXIO); } /* get final misc params */ bxe_get_params(sc); /* set the default MTU (changed via ifconfig) */ sc->mtu = ETHERMTU; bxe_set_modes_bitmap(sc); /* XXX * If in AFEX mode and the function is configured for FCoE * then bail... no L2 allowed. */ /* get phy settings from shmem and 'and' against admin settings */ bxe_get_phy_info(sc); /* initialize the FreeBSD ifnet interface */ if (bxe_init_ifnet(sc) != 0) { bxe_release_mutexes(sc); bxe_deallocate_bars(sc); pci_disable_busmaster(dev); return (ENXIO); } /* allocate device interrupts */ if (bxe_interrupt_alloc(sc) != 0) { if (sc->ifp != NULL) { ether_ifdetach(sc->ifp); } ifmedia_removeall(&sc->ifmedia); bxe_release_mutexes(sc); bxe_deallocate_bars(sc); pci_disable_busmaster(dev); return (ENXIO); } /* allocate ilt */ if (bxe_alloc_ilt_mem(sc) != 0) { bxe_interrupt_free(sc); if (sc->ifp != NULL) { ether_ifdetach(sc->ifp); } ifmedia_removeall(&sc->ifmedia); bxe_release_mutexes(sc); bxe_deallocate_bars(sc); pci_disable_busmaster(dev); return (ENXIO); } /* allocate the host hardware/software hsi structures */ if (bxe_alloc_hsi_mem(sc) != 0) { bxe_free_ilt_mem(sc); bxe_interrupt_free(sc); if (sc->ifp != NULL) { ether_ifdetach(sc->ifp); } ifmedia_removeall(&sc->ifmedia); bxe_release_mutexes(sc); bxe_deallocate_bars(sc); pci_disable_busmaster(dev); return (ENXIO); } /* need to reset chip if UNDI was active */ if (IS_PF(sc) && !BXE_NOMCP(sc)) { /* init fw_seq */ sc->fw_seq = (SHMEM_RD(sc, func_mb[SC_FW_MB_IDX(sc)].drv_mb_header) & DRV_MSG_SEQ_NUMBER_MASK); BLOGD(sc, DBG_LOAD, "prev unload fw_seq 0x%04x\n", sc->fw_seq); bxe_prev_unload(sc); } #if 1 /* XXX */ bxe_dcbx_set_state(sc, FALSE, BXE_DCBX_ENABLED_OFF); #else if (SHMEM2_HAS(sc, dcbx_lldp_params_offset) && SHMEM2_HAS(sc, dcbx_lldp_dcbx_stat_offset) && SHMEM2_RD(sc, dcbx_lldp_params_offset) && SHMEM2_RD(sc, dcbx_lldp_dcbx_stat_offset)) { bxe_dcbx_set_state(sc, TRUE, BXE_DCBX_ENABLED_ON_NEG_ON); bxe_dcbx_init_params(sc); } else { bxe_dcbx_set_state(sc, FALSE, BXE_DCBX_ENABLED_OFF); } #endif /* calculate qm_cid_count */ sc->qm_cid_count = bxe_set_qm_cid_count(sc); BLOGD(sc, DBG_LOAD, "qm_cid_count=%d\n", sc->qm_cid_count); sc->max_cos = 1; bxe_init_multi_cos(sc); bxe_add_sysctls(sc); return (0); } /* * Device detach function. * * Stops the controller, resets the controller, and releases resources. * * Returns: * 0 = Success, >0 = Failure */ static int bxe_detach(device_t dev) { struct bxe_softc *sc; if_t ifp; sc = device_get_softc(dev); BLOGD(sc, DBG_LOAD, "Starting detach...\n"); ifp = sc->ifp; if (ifp != NULL && if_vlantrunkinuse(ifp)) { BLOGE(sc, "Cannot detach while VLANs are in use.\n"); return(EBUSY); } /* stop the periodic callout */ bxe_periodic_stop(sc); /* stop the chip taskqueue */ atomic_store_rel_long(&sc->chip_tq_flags, CHIP_TQ_NONE); if (sc->chip_tq) { taskqueue_drain(sc->chip_tq, &sc->chip_tq_task); taskqueue_free(sc->chip_tq); sc->chip_tq = NULL; } /* stop and reset the controller if it was open */ if (sc->state != BXE_STATE_CLOSED) { BXE_CORE_LOCK(sc); bxe_nic_unload(sc, UNLOAD_CLOSE, TRUE); BXE_CORE_UNLOCK(sc); } /* release the network interface */ if (ifp != NULL) { ether_ifdetach(ifp); } ifmedia_removeall(&sc->ifmedia); /* XXX do the following based on driver state... */ /* free the host hardware/software hsi structures */ bxe_free_hsi_mem(sc); /* free ilt */ bxe_free_ilt_mem(sc); /* release the interrupts */ bxe_interrupt_free(sc); /* Release the mutexes*/ bxe_release_mutexes(sc); /* Release the PCIe BAR mapped memory */ bxe_deallocate_bars(sc); /* Release the FreeBSD interface. */ if (sc->ifp != NULL) { if_free(sc->ifp); } pci_disable_busmaster(dev); return (0); } /* * Device shutdown function. * * Stops and resets the controller. * * Returns: * Nothing */ static int bxe_shutdown(device_t dev) { struct bxe_softc *sc; sc = device_get_softc(dev); BLOGD(sc, DBG_LOAD, "Starting shutdown...\n"); /* stop the periodic callout */ bxe_periodic_stop(sc); BXE_CORE_LOCK(sc); bxe_nic_unload(sc, UNLOAD_NORMAL, FALSE); BXE_CORE_UNLOCK(sc); return (0); } void bxe_igu_ack_sb(struct bxe_softc *sc, uint8_t igu_sb_id, uint8_t segment, uint16_t index, uint8_t op, uint8_t update) { uint32_t igu_addr = sc->igu_base_addr; igu_addr += (IGU_CMD_INT_ACK_BASE + igu_sb_id)*8; bxe_igu_ack_sb_gen(sc, igu_sb_id, segment, index, op, update, igu_addr); } static void bxe_igu_clear_sb_gen(struct bxe_softc *sc, uint8_t func, uint8_t idu_sb_id, uint8_t is_pf) { uint32_t data, ctl, cnt = 100; uint32_t igu_addr_data = IGU_REG_COMMAND_REG_32LSB_DATA; uint32_t igu_addr_ctl = IGU_REG_COMMAND_REG_CTRL; uint32_t igu_addr_ack = IGU_REG_CSTORM_TYPE_0_SB_CLEANUP + (idu_sb_id/32)*4; uint32_t sb_bit = 1 << (idu_sb_id%32); uint32_t func_encode = func | (is_pf ? 1 : 0) << IGU_FID_ENCODE_IS_PF_SHIFT; uint32_t addr_encode = IGU_CMD_E2_PROD_UPD_BASE + idu_sb_id; /* Not supported in BC mode */ if (CHIP_INT_MODE_IS_BC(sc)) { return; } data = ((IGU_USE_REGISTER_cstorm_type_0_sb_cleanup << IGU_REGULAR_CLEANUP_TYPE_SHIFT) | IGU_REGULAR_CLEANUP_SET | IGU_REGULAR_BCLEANUP); ctl = ((addr_encode << IGU_CTRL_REG_ADDRESS_SHIFT) | (func_encode << IGU_CTRL_REG_FID_SHIFT) | (IGU_CTRL_CMD_TYPE_WR << IGU_CTRL_REG_TYPE_SHIFT)); BLOGD(sc, DBG_LOAD, "write 0x%08x to IGU(via GRC) addr 0x%x\n", data, igu_addr_data); REG_WR(sc, igu_addr_data, data); bus_space_barrier(sc->bar[BAR0].tag, sc->bar[BAR0].handle, 0, 0, BUS_SPACE_BARRIER_WRITE); mb(); BLOGD(sc, DBG_LOAD, "write 0x%08x to IGU(via GRC) addr 0x%x\n", ctl, igu_addr_ctl); REG_WR(sc, igu_addr_ctl, ctl); bus_space_barrier(sc->bar[BAR0].tag, sc->bar[BAR0].handle, 0, 0, BUS_SPACE_BARRIER_WRITE); mb(); /* wait for clean up to finish */ while (!(REG_RD(sc, igu_addr_ack) & sb_bit) && --cnt) { DELAY(20000); } if (!(REG_RD(sc, igu_addr_ack) & sb_bit)) { BLOGD(sc, DBG_LOAD, "Unable to finish IGU cleanup: " "idu_sb_id %d offset %d bit %d (cnt %d)\n", idu_sb_id, idu_sb_id/32, idu_sb_id%32, cnt); } } static void bxe_igu_clear_sb(struct bxe_softc *sc, uint8_t idu_sb_id) { bxe_igu_clear_sb_gen(sc, SC_FUNC(sc), idu_sb_id, TRUE /*PF*/); } /*******************/ /* ECORE CALLBACKS */ /*******************/ static void bxe_reset_common(struct bxe_softc *sc) { uint32_t val = 0x1400; /* reset_common */ REG_WR(sc, (GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_CLEAR), 0xd3ffff7f); if (CHIP_IS_E3(sc)) { val |= MISC_REGISTERS_RESET_REG_2_MSTAT0; val |= MISC_REGISTERS_RESET_REG_2_MSTAT1; } REG_WR(sc, (GRCBASE_MISC + MISC_REGISTERS_RESET_REG_2_CLEAR), val); } static void bxe_common_init_phy(struct bxe_softc *sc) { uint32_t shmem_base[2]; uint32_t shmem2_base[2]; /* Avoid common init in case MFW supports LFA */ if (SHMEM2_RD(sc, size) > (uint32_t)offsetof(struct shmem2_region, lfa_host_addr[SC_PORT(sc)])) { return; } shmem_base[0] = sc->devinfo.shmem_base; shmem2_base[0] = sc->devinfo.shmem2_base; if (!CHIP_IS_E1x(sc)) { shmem_base[1] = SHMEM2_RD(sc, other_shmem_base_addr); shmem2_base[1] = SHMEM2_RD(sc, other_shmem2_base_addr); } BXE_PHY_LOCK(sc); elink_common_init_phy(sc, shmem_base, shmem2_base, sc->devinfo.chip_id, 0); BXE_PHY_UNLOCK(sc); } static void bxe_pf_disable(struct bxe_softc *sc) { uint32_t val = REG_RD(sc, IGU_REG_PF_CONFIGURATION); val &= ~IGU_PF_CONF_FUNC_EN; REG_WR(sc, IGU_REG_PF_CONFIGURATION, val); REG_WR(sc, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, 0); REG_WR(sc, CFC_REG_WEAK_ENABLE_PF, 0); } static void bxe_init_pxp(struct bxe_softc *sc) { uint16_t devctl; int r_order, w_order; devctl = bxe_pcie_capability_read(sc, PCIR_EXPRESS_DEVICE_CTL, 2); BLOGD(sc, DBG_LOAD, "read 0x%08x from devctl\n", devctl); w_order = ((devctl & PCIM_EXP_CTL_MAX_PAYLOAD) >> 5); if (sc->mrrs == -1) { r_order = ((devctl & PCIM_EXP_CTL_MAX_READ_REQUEST) >> 12); } else { BLOGD(sc, DBG_LOAD, "forcing read order to %d\n", sc->mrrs); r_order = sc->mrrs; } ecore_init_pxp_arb(sc, r_order, w_order); } static uint32_t bxe_get_pretend_reg(struct bxe_softc *sc) { uint32_t base = PXP2_REG_PGL_PRETEND_FUNC_F0; uint32_t stride = (PXP2_REG_PGL_PRETEND_FUNC_F1 - base); return (base + (SC_ABS_FUNC(sc)) * stride); } /* * Called only on E1H or E2. * When pretending to be PF, the pretend value is the function number 0..7. * When pretending to be VF, the pretend val is the PF-num:VF-valid:ABS-VFID * combination. */ static int bxe_pretend_func(struct bxe_softc *sc, uint16_t pretend_func_val) { uint32_t pretend_reg; if (CHIP_IS_E1H(sc) && (pretend_func_val > E1H_FUNC_MAX)) { return (-1); } /* get my own pretend register */ pretend_reg = bxe_get_pretend_reg(sc); REG_WR(sc, pretend_reg, pretend_func_val); REG_RD(sc, pretend_reg); return (0); } static void bxe_iov_init_dmae(struct bxe_softc *sc) { return; #if 0 BLOGD(sc, DBG_LOAD, "SRIOV is %s\n", IS_SRIOV(sc) ? "ON" : "OFF"); if (!IS_SRIOV(sc)) { return; } REG_WR(sc, DMAE_REG_BACKWARD_COMP_EN, 0); #endif } #if 0 static int bxe_iov_init_ilt(struct bxe_softc *sc, uint16_t line) { return (line); #if 0 int i; struct ecore_ilt* ilt = sc->ilt; if (!IS_SRIOV(sc)) { return (line); } /* set vfs ilt lines */ for (i = 0; i < BXE_VF_CIDS/ILT_PAGE_CIDS ; i++) { struct hw_dma *hw_cxt = SC_VF_CXT_PAGE(sc,i); ilt->lines[line+i].page = hw_cxt->addr; ilt->lines[line+i].page_mapping = hw_cxt->mapping; ilt->lines[line+i].size = hw_cxt->size; /* doesn't matter */ } return (line+i); #endif } #endif static void bxe_iov_init_dq(struct bxe_softc *sc) { return; #if 0 if (!IS_SRIOV(sc)) { return; } /* Set the DQ such that the CID reflect the abs_vfid */ REG_WR(sc, DORQ_REG_VF_NORM_VF_BASE, 0); REG_WR(sc, DORQ_REG_MAX_RVFID_SIZE, ilog2(BNX2X_MAX_NUM_OF_VFS)); /* * Set VFs starting CID. If its > 0 the preceding CIDs are belong to * the PF L2 queues */ REG_WR(sc, DORQ_REG_VF_NORM_CID_BASE, BNX2X_FIRST_VF_CID); /* The VF window size is the log2 of the max number of CIDs per VF */ REG_WR(sc, DORQ_REG_VF_NORM_CID_WND_SIZE, BNX2X_VF_CID_WND); /* * The VF doorbell size 0 - *B, 4 - 128B. We set it here to match * the Pf doorbell size although the 2 are independent. */ REG_WR(sc, DORQ_REG_VF_NORM_CID_OFST, BNX2X_DB_SHIFT - BNX2X_DB_MIN_SHIFT); /* * No security checks for now - * configure single rule (out of 16) mask = 0x1, value = 0x0, * CID range 0 - 0x1ffff */ REG_WR(sc, DORQ_REG_VF_TYPE_MASK_0, 1); REG_WR(sc, DORQ_REG_VF_TYPE_VALUE_0, 0); REG_WR(sc, DORQ_REG_VF_TYPE_MIN_MCID_0, 0); REG_WR(sc, DORQ_REG_VF_TYPE_MAX_MCID_0, 0x1ffff); /* set the number of VF alllowed doorbells to the full DQ range */ REG_WR(sc, DORQ_REG_VF_NORM_MAX_CID_COUNT, 0x20000); /* set the VF doorbell threshold */ REG_WR(sc, DORQ_REG_VF_USAGE_CT_LIMIT, 4); #endif } /* send a NIG loopback debug packet */ static void bxe_lb_pckt(struct bxe_softc *sc) { uint32_t wb_write[3]; /* Ethernet source and destination addresses */ wb_write[0] = 0x55555555; wb_write[1] = 0x55555555; wb_write[2] = 0x20; /* SOP */ REG_WR_DMAE(sc, NIG_REG_DEBUG_PACKET_LB, wb_write, 3); /* NON-IP protocol */ wb_write[0] = 0x09000000; wb_write[1] = 0x55555555; wb_write[2] = 0x10; /* EOP, eop_bvalid = 0 */ REG_WR_DMAE(sc, NIG_REG_DEBUG_PACKET_LB, wb_write, 3); } /* * Some of the internal memories are not directly readable from the driver. * To test them we send debug packets. */ static int bxe_int_mem_test(struct bxe_softc *sc) { int factor; int count, i; uint32_t val = 0; if (CHIP_REV_IS_FPGA(sc)) { factor = 120; } else if (CHIP_REV_IS_EMUL(sc)) { factor = 200; } else { factor = 1; } /* disable inputs of parser neighbor blocks */ REG_WR(sc, TSDM_REG_ENABLE_IN1, 0x0); REG_WR(sc, TCM_REG_PRS_IFEN, 0x0); REG_WR(sc, CFC_REG_DEBUG0, 0x1); REG_WR(sc, NIG_REG_PRS_REQ_IN_EN, 0x0); /* write 0 to parser credits for CFC search request */ REG_WR(sc, PRS_REG_CFC_SEARCH_INITIAL_CREDIT, 0x0); /* send Ethernet packet */ bxe_lb_pckt(sc); /* TODO do i reset NIG statistic? */ /* Wait until NIG register shows 1 packet of size 0x10 */ count = 1000 * factor; while (count) { bxe_read_dmae(sc, NIG_REG_STAT2_BRB_OCTET, 2); val = *BXE_SP(sc, wb_data[0]); if (val == 0x10) { break; } DELAY(10000); count--; } if (val != 0x10) { BLOGE(sc, "NIG timeout val=0x%x\n", val); return (-1); } /* wait until PRS register shows 1 packet */ count = (1000 * factor); while (count) { val = REG_RD(sc, PRS_REG_NUM_OF_PACKETS); if (val == 1) { break; } DELAY(10000); count--; } if (val != 0x1) { BLOGE(sc, "PRS timeout val=0x%x\n", val); return (-2); } /* Reset and init BRB, PRS */ REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_CLEAR, 0x03); DELAY(50000); REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_SET, 0x03); DELAY(50000); ecore_init_block(sc, BLOCK_BRB1, PHASE_COMMON); ecore_init_block(sc, BLOCK_PRS, PHASE_COMMON); /* Disable inputs of parser neighbor blocks */ REG_WR(sc, TSDM_REG_ENABLE_IN1, 0x0); REG_WR(sc, TCM_REG_PRS_IFEN, 0x0); REG_WR(sc, CFC_REG_DEBUG0, 0x1); REG_WR(sc, NIG_REG_PRS_REQ_IN_EN, 0x0); /* Write 0 to parser credits for CFC search request */ REG_WR(sc, PRS_REG_CFC_SEARCH_INITIAL_CREDIT, 0x0); /* send 10 Ethernet packets */ for (i = 0; i < 10; i++) { bxe_lb_pckt(sc); } /* Wait until NIG register shows 10+1 packets of size 11*0x10 = 0xb0 */ count = (1000 * factor); while (count) { bxe_read_dmae(sc, NIG_REG_STAT2_BRB_OCTET, 2); val = *BXE_SP(sc, wb_data[0]); if (val == 0xb0) { break; } DELAY(10000); count--; } if (val != 0xb0) { BLOGE(sc, "NIG timeout val=0x%x\n", val); return (-3); } /* Wait until PRS register shows 2 packets */ val = REG_RD(sc, PRS_REG_NUM_OF_PACKETS); if (val != 2) { BLOGE(sc, "PRS timeout val=0x%x\n", val); } /* Write 1 to parser credits for CFC search request */ REG_WR(sc, PRS_REG_CFC_SEARCH_INITIAL_CREDIT, 0x1); /* Wait until PRS register shows 3 packets */ DELAY(10000 * factor); /* Wait until NIG register shows 1 packet of size 0x10 */ val = REG_RD(sc, PRS_REG_NUM_OF_PACKETS); if (val != 3) { BLOGE(sc, "PRS timeout val=0x%x\n", val); } /* clear NIG EOP FIFO */ for (i = 0; i < 11; i++) { REG_RD(sc, NIG_REG_INGRESS_EOP_LB_FIFO); } val = REG_RD(sc, NIG_REG_INGRESS_EOP_LB_EMPTY); if (val != 1) { BLOGE(sc, "clear of NIG failed\n"); return (-4); } /* Reset and init BRB, PRS, NIG */ REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_CLEAR, 0x03); DELAY(50000); REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_SET, 0x03); DELAY(50000); ecore_init_block(sc, BLOCK_BRB1, PHASE_COMMON); ecore_init_block(sc, BLOCK_PRS, PHASE_COMMON); if (!CNIC_SUPPORT(sc)) { /* set NIC mode */ REG_WR(sc, PRS_REG_NIC_MODE, 1); } /* Enable inputs of parser neighbor blocks */ REG_WR(sc, TSDM_REG_ENABLE_IN1, 0x7fffffff); REG_WR(sc, TCM_REG_PRS_IFEN, 0x1); REG_WR(sc, CFC_REG_DEBUG0, 0x0); REG_WR(sc, NIG_REG_PRS_REQ_IN_EN, 0x1); return (0); } static void bxe_setup_fan_failure_detection(struct bxe_softc *sc) { int is_required; uint32_t val; int port; is_required = 0; val = (SHMEM_RD(sc, dev_info.shared_hw_config.config2) & SHARED_HW_CFG_FAN_FAILURE_MASK); if (val == SHARED_HW_CFG_FAN_FAILURE_ENABLED) { is_required = 1; } /* * The fan failure mechanism is usually related to the PHY type since * the power consumption of the board is affected by the PHY. Currently, * fan is required for most designs with SFX7101, BCM8727 and BCM8481. */ else if (val == SHARED_HW_CFG_FAN_FAILURE_PHY_TYPE) { for (port = PORT_0; port < PORT_MAX; port++) { is_required |= elink_fan_failure_det_req(sc, sc->devinfo.shmem_base, sc->devinfo.shmem2_base, port); } } BLOGD(sc, DBG_LOAD, "fan detection setting: %d\n", is_required); if (is_required == 0) { return; } /* Fan failure is indicated by SPIO 5 */ bxe_set_spio(sc, MISC_SPIO_SPIO5, MISC_SPIO_INPUT_HI_Z); /* set to active low mode */ val = REG_RD(sc, MISC_REG_SPIO_INT); val |= (MISC_SPIO_SPIO5 << MISC_SPIO_INT_OLD_SET_POS); REG_WR(sc, MISC_REG_SPIO_INT, val); /* enable interrupt to signal the IGU */ val = REG_RD(sc, MISC_REG_SPIO_EVENT_EN); val |= MISC_SPIO_SPIO5; REG_WR(sc, MISC_REG_SPIO_EVENT_EN, val); } static void bxe_enable_blocks_attention(struct bxe_softc *sc) { uint32_t val; REG_WR(sc, PXP_REG_PXP_INT_MASK_0, 0); if (!CHIP_IS_E1x(sc)) { REG_WR(sc, PXP_REG_PXP_INT_MASK_1, 0x40); } else { REG_WR(sc, PXP_REG_PXP_INT_MASK_1, 0); } REG_WR(sc, DORQ_REG_DORQ_INT_MASK, 0); REG_WR(sc, CFC_REG_CFC_INT_MASK, 0); /* * mask read length error interrupts in brb for parser * (parsing unit and 'checksum and crc' unit) * these errors are legal (PU reads fixed length and CAC can cause * read length error on truncated packets) */ REG_WR(sc, BRB1_REG_BRB1_INT_MASK, 0xFC00); REG_WR(sc, QM_REG_QM_INT_MASK, 0); REG_WR(sc, TM_REG_TM_INT_MASK, 0); REG_WR(sc, XSDM_REG_XSDM_INT_MASK_0, 0); REG_WR(sc, XSDM_REG_XSDM_INT_MASK_1, 0); REG_WR(sc, XCM_REG_XCM_INT_MASK, 0); /* REG_WR(sc, XSEM_REG_XSEM_INT_MASK_0, 0); */ /* REG_WR(sc, XSEM_REG_XSEM_INT_MASK_1, 0); */ REG_WR(sc, USDM_REG_USDM_INT_MASK_0, 0); REG_WR(sc, USDM_REG_USDM_INT_MASK_1, 0); REG_WR(sc, UCM_REG_UCM_INT_MASK, 0); /* REG_WR(sc, USEM_REG_USEM_INT_MASK_0, 0); */ /* REG_WR(sc, USEM_REG_USEM_INT_MASK_1, 0); */ REG_WR(sc, GRCBASE_UPB + PB_REG_PB_INT_MASK, 0); REG_WR(sc, CSDM_REG_CSDM_INT_MASK_0, 0); REG_WR(sc, CSDM_REG_CSDM_INT_MASK_1, 0); REG_WR(sc, CCM_REG_CCM_INT_MASK, 0); /* REG_WR(sc, CSEM_REG_CSEM_INT_MASK_0, 0); */ /* REG_WR(sc, CSEM_REG_CSEM_INT_MASK_1, 0); */ val = (PXP2_PXP2_INT_MASK_0_REG_PGL_CPL_AFT | PXP2_PXP2_INT_MASK_0_REG_PGL_CPL_OF | PXP2_PXP2_INT_MASK_0_REG_PGL_PCIE_ATTN); if (!CHIP_IS_E1x(sc)) { val |= (PXP2_PXP2_INT_MASK_0_REG_PGL_READ_BLOCKED | PXP2_PXP2_INT_MASK_0_REG_PGL_WRITE_BLOCKED); } REG_WR(sc, PXP2_REG_PXP2_INT_MASK_0, val); REG_WR(sc, TSDM_REG_TSDM_INT_MASK_0, 0); REG_WR(sc, TSDM_REG_TSDM_INT_MASK_1, 0); REG_WR(sc, TCM_REG_TCM_INT_MASK, 0); /* REG_WR(sc, TSEM_REG_TSEM_INT_MASK_0, 0); */ if (!CHIP_IS_E1x(sc)) { /* enable VFC attentions: bits 11 and 12, bits 31:13 reserved */ REG_WR(sc, TSEM_REG_TSEM_INT_MASK_1, 0x07ff); } REG_WR(sc, CDU_REG_CDU_INT_MASK, 0); REG_WR(sc, DMAE_REG_DMAE_INT_MASK, 0); /* REG_WR(sc, MISC_REG_MISC_INT_MASK, 0); */ REG_WR(sc, PBF_REG_PBF_INT_MASK, 0x18); /* bit 3,4 masked */ } /** * bxe_init_hw_common - initialize the HW at the COMMON phase. * * @sc: driver handle */ static int bxe_init_hw_common(struct bxe_softc *sc) { uint8_t abs_func_id; uint32_t val; BLOGD(sc, DBG_LOAD, "starting common init for func %d\n", SC_ABS_FUNC(sc)); /* * take the RESET lock to protect undi_unload flow from accessing * registers while we are resetting the chip */ bxe_acquire_hw_lock(sc, HW_LOCK_RESOURCE_RESET); bxe_reset_common(sc); REG_WR(sc, (GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_SET), 0xffffffff); val = 0xfffc; if (CHIP_IS_E3(sc)) { val |= MISC_REGISTERS_RESET_REG_2_MSTAT0; val |= MISC_REGISTERS_RESET_REG_2_MSTAT1; } REG_WR(sc, (GRCBASE_MISC + MISC_REGISTERS_RESET_REG_2_SET), val); bxe_release_hw_lock(sc, HW_LOCK_RESOURCE_RESET); ecore_init_block(sc, BLOCK_MISC, PHASE_COMMON); BLOGD(sc, DBG_LOAD, "after misc block init\n"); if (!CHIP_IS_E1x(sc)) { /* * 4-port mode or 2-port mode we need to turn off master-enable for * everyone. After that we turn it back on for self. So, we disregard * multi-function, and always disable all functions on the given path, * this means 0,2,4,6 for path 0 and 1,3,5,7 for path 1 */ for (abs_func_id = SC_PATH(sc); abs_func_id < (E2_FUNC_MAX * 2); abs_func_id += 2) { if (abs_func_id == SC_ABS_FUNC(sc)) { REG_WR(sc, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, 1); continue; } bxe_pretend_func(sc, abs_func_id); /* clear pf enable */ bxe_pf_disable(sc); bxe_pretend_func(sc, SC_ABS_FUNC(sc)); } } BLOGD(sc, DBG_LOAD, "after pf disable\n"); ecore_init_block(sc, BLOCK_PXP, PHASE_COMMON); if (CHIP_IS_E1(sc)) { /* * enable HW interrupt from PXP on USDM overflow * bit 16 on INT_MASK_0 */ REG_WR(sc, PXP_REG_PXP_INT_MASK_0, 0); } ecore_init_block(sc, BLOCK_PXP2, PHASE_COMMON); bxe_init_pxp(sc); #ifdef __BIG_ENDIAN REG_WR(sc, PXP2_REG_RQ_QM_ENDIAN_M, 1); REG_WR(sc, PXP2_REG_RQ_TM_ENDIAN_M, 1); REG_WR(sc, PXP2_REG_RQ_SRC_ENDIAN_M, 1); REG_WR(sc, PXP2_REG_RQ_CDU_ENDIAN_M, 1); REG_WR(sc, PXP2_REG_RQ_DBG_ENDIAN_M, 1); /* make sure this value is 0 */ REG_WR(sc, PXP2_REG_RQ_HC_ENDIAN_M, 0); //REG_WR(sc, PXP2_REG_RD_PBF_SWAP_MODE, 1); REG_WR(sc, PXP2_REG_RD_QM_SWAP_MODE, 1); REG_WR(sc, PXP2_REG_RD_TM_SWAP_MODE, 1); REG_WR(sc, PXP2_REG_RD_SRC_SWAP_MODE, 1); REG_WR(sc, PXP2_REG_RD_CDURD_SWAP_MODE, 1); #endif ecore_ilt_init_page_size(sc, INITOP_SET); if (CHIP_REV_IS_FPGA(sc) && CHIP_IS_E1H(sc)) { REG_WR(sc, PXP2_REG_PGL_TAGS_LIMIT, 0x1); } /* let the HW do it's magic... */ DELAY(100000); /* finish PXP init */ val = REG_RD(sc, PXP2_REG_RQ_CFG_DONE); if (val != 1) { BLOGE(sc, "PXP2 CFG failed\n"); return (-1); } val = REG_RD(sc, PXP2_REG_RD_INIT_DONE); if (val != 1) { BLOGE(sc, "PXP2 RD_INIT failed\n"); return (-1); } BLOGD(sc, DBG_LOAD, "after pxp init\n"); /* * Timer bug workaround for E2 only. We need to set the entire ILT to have * entries with value "0" and valid bit on. This needs to be done by the * first PF that is loaded in a path (i.e. common phase) */ if (!CHIP_IS_E1x(sc)) { /* * In E2 there is a bug in the timers block that can cause function 6 / 7 * (i.e. vnic3) to start even if it is marked as "scan-off". * This occurs when a different function (func2,3) is being marked * as "scan-off". Real-life scenario for example: if a driver is being * load-unloaded while func6,7 are down. This will cause the timer to access * the ilt, translate to a logical address and send a request to read/write. * Since the ilt for the function that is down is not valid, this will cause * a translation error which is unrecoverable. * The Workaround is intended to make sure that when this happens nothing * fatal will occur. The workaround: * 1. First PF driver which loads on a path will: * a. After taking the chip out of reset, by using pretend, * it will write "0" to the following registers of * the other vnics. * REG_WR(pdev, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, 0); * REG_WR(pdev, CFC_REG_WEAK_ENABLE_PF,0); * REG_WR(pdev, CFC_REG_STRONG_ENABLE_PF,0); * And for itself it will write '1' to * PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER to enable * dmae-operations (writing to pram for example.) * note: can be done for only function 6,7 but cleaner this * way. * b. Write zero+valid to the entire ILT. * c. Init the first_timers_ilt_entry, last_timers_ilt_entry of * VNIC3 (of that port). The range allocated will be the * entire ILT. This is needed to prevent ILT range error. * 2. Any PF driver load flow: * a. ILT update with the physical addresses of the allocated * logical pages. * b. Wait 20msec. - note that this timeout is needed to make * sure there are no requests in one of the PXP internal * queues with "old" ILT addresses. * c. PF enable in the PGLC. * d. Clear the was_error of the PF in the PGLC. (could have * occurred while driver was down) * e. PF enable in the CFC (WEAK + STRONG) * f. Timers scan enable * 3. PF driver unload flow: * a. Clear the Timers scan_en. * b. Polling for scan_on=0 for that PF. * c. Clear the PF enable bit in the PXP. * d. Clear the PF enable in the CFC (WEAK + STRONG) * e. Write zero+valid to all ILT entries (The valid bit must * stay set) * f. If this is VNIC 3 of a port then also init * first_timers_ilt_entry to zero and last_timers_ilt_entry * to the last enrty in the ILT. * * Notes: * Currently the PF error in the PGLC is non recoverable. * In the future the there will be a recovery routine for this error. * Currently attention is masked. * Having an MCP lock on the load/unload process does not guarantee that * there is no Timer disable during Func6/7 enable. This is because the * Timers scan is currently being cleared by the MCP on FLR. * Step 2.d can be done only for PF6/7 and the driver can also check if * there is error before clearing it. But the flow above is simpler and * more general. * All ILT entries are written by zero+valid and not just PF6/7 * ILT entries since in the future the ILT entries allocation for * PF-s might be dynamic. */ struct ilt_client_info ilt_cli; struct ecore_ilt ilt; memset(&ilt_cli, 0, sizeof(struct ilt_client_info)); memset(&ilt, 0, sizeof(struct ecore_ilt)); /* initialize dummy TM client */ ilt_cli.start = 0; ilt_cli.end = ILT_NUM_PAGE_ENTRIES - 1; ilt_cli.client_num = ILT_CLIENT_TM; /* * Step 1: set zeroes to all ilt page entries with valid bit on * Step 2: set the timers first/last ilt entry to point * to the entire range to prevent ILT range error for 3rd/4th * vnic (this code assumes existence of the vnic) * * both steps performed by call to ecore_ilt_client_init_op() * with dummy TM client * * we must use pretend since PXP2_REG_RQ_##blk##_FIRST_ILT * and his brother are split registers */ bxe_pretend_func(sc, (SC_PATH(sc) + 6)); ecore_ilt_client_init_op_ilt(sc, &ilt, &ilt_cli, INITOP_CLEAR); bxe_pretend_func(sc, SC_ABS_FUNC(sc)); REG_WR(sc, PXP2_REG_RQ_DRAM_ALIGN, BXE_PXP_DRAM_ALIGN); REG_WR(sc, PXP2_REG_RQ_DRAM_ALIGN_RD, BXE_PXP_DRAM_ALIGN); REG_WR(sc, PXP2_REG_RQ_DRAM_ALIGN_SEL, 1); } REG_WR(sc, PXP2_REG_RQ_DISABLE_INPUTS, 0); REG_WR(sc, PXP2_REG_RD_DISABLE_INPUTS, 0); if (!CHIP_IS_E1x(sc)) { int factor = CHIP_REV_IS_EMUL(sc) ? 1000 : (CHIP_REV_IS_FPGA(sc) ? 400 : 0); ecore_init_block(sc, BLOCK_PGLUE_B, PHASE_COMMON); ecore_init_block(sc, BLOCK_ATC, PHASE_COMMON); /* let the HW do it's magic... */ do { DELAY(200000); val = REG_RD(sc, ATC_REG_ATC_INIT_DONE); } while (factor-- && (val != 1)); if (val != 1) { BLOGE(sc, "ATC_INIT failed\n"); return (-1); } } BLOGD(sc, DBG_LOAD, "after pglue and atc init\n"); ecore_init_block(sc, BLOCK_DMAE, PHASE_COMMON); bxe_iov_init_dmae(sc); /* clean the DMAE memory */ sc->dmae_ready = 1; ecore_init_fill(sc, TSEM_REG_PRAM, 0, 8, 1); ecore_init_block(sc, BLOCK_TCM, PHASE_COMMON); ecore_init_block(sc, BLOCK_UCM, PHASE_COMMON); ecore_init_block(sc, BLOCK_CCM, PHASE_COMMON); ecore_init_block(sc, BLOCK_XCM, PHASE_COMMON); bxe_read_dmae(sc, XSEM_REG_PASSIVE_BUFFER, 3); bxe_read_dmae(sc, CSEM_REG_PASSIVE_BUFFER, 3); bxe_read_dmae(sc, TSEM_REG_PASSIVE_BUFFER, 3); bxe_read_dmae(sc, USEM_REG_PASSIVE_BUFFER, 3); ecore_init_block(sc, BLOCK_QM, PHASE_COMMON); /* QM queues pointers table */ ecore_qm_init_ptr_table(sc, sc->qm_cid_count, INITOP_SET); /* soft reset pulse */ REG_WR(sc, QM_REG_SOFT_RESET, 1); REG_WR(sc, QM_REG_SOFT_RESET, 0); if (CNIC_SUPPORT(sc)) ecore_init_block(sc, BLOCK_TM, PHASE_COMMON); ecore_init_block(sc, BLOCK_DORQ, PHASE_COMMON); REG_WR(sc, DORQ_REG_DPM_CID_OFST, BXE_DB_SHIFT); if (!CHIP_REV_IS_SLOW(sc)) { /* enable hw interrupt from doorbell Q */ REG_WR(sc, DORQ_REG_DORQ_INT_MASK, 0); } ecore_init_block(sc, BLOCK_BRB1, PHASE_COMMON); ecore_init_block(sc, BLOCK_PRS, PHASE_COMMON); REG_WR(sc, PRS_REG_A_PRSU_20, 0xf); if (!CHIP_IS_E1(sc)) { REG_WR(sc, PRS_REG_E1HOV_MODE, sc->devinfo.mf_info.path_has_ovlan); } if (!CHIP_IS_E1x(sc) && !CHIP_IS_E3B0(sc)) { if (IS_MF_AFEX(sc)) { /* * configure that AFEX and VLAN headers must be * received in AFEX mode */ REG_WR(sc, PRS_REG_HDRS_AFTER_BASIC, 0xE); REG_WR(sc, PRS_REG_MUST_HAVE_HDRS, 0xA); REG_WR(sc, PRS_REG_HDRS_AFTER_TAG_0, 0x6); REG_WR(sc, PRS_REG_TAG_ETHERTYPE_0, 0x8926); REG_WR(sc, PRS_REG_TAG_LEN_0, 0x4); } else { /* * Bit-map indicating which L2 hdrs may appear * after the basic Ethernet header */ REG_WR(sc, PRS_REG_HDRS_AFTER_BASIC, sc->devinfo.mf_info.path_has_ovlan ? 7 : 6); } } ecore_init_block(sc, BLOCK_TSDM, PHASE_COMMON); ecore_init_block(sc, BLOCK_CSDM, PHASE_COMMON); ecore_init_block(sc, BLOCK_USDM, PHASE_COMMON); ecore_init_block(sc, BLOCK_XSDM, PHASE_COMMON); if (!CHIP_IS_E1x(sc)) { /* reset VFC memories */ REG_WR(sc, TSEM_REG_FAST_MEMORY + VFC_REG_MEMORIES_RST, VFC_MEMORIES_RST_REG_CAM_RST | VFC_MEMORIES_RST_REG_RAM_RST); REG_WR(sc, XSEM_REG_FAST_MEMORY + VFC_REG_MEMORIES_RST, VFC_MEMORIES_RST_REG_CAM_RST | VFC_MEMORIES_RST_REG_RAM_RST); DELAY(20000); } ecore_init_block(sc, BLOCK_TSEM, PHASE_COMMON); ecore_init_block(sc, BLOCK_USEM, PHASE_COMMON); ecore_init_block(sc, BLOCK_CSEM, PHASE_COMMON); ecore_init_block(sc, BLOCK_XSEM, PHASE_COMMON); /* sync semi rtc */ REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_CLEAR, 0x80000000); REG_WR(sc, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_SET, 0x80000000); ecore_init_block(sc, BLOCK_UPB, PHASE_COMMON); ecore_init_block(sc, BLOCK_XPB, PHASE_COMMON); ecore_init_block(sc, BLOCK_PBF, PHASE_COMMON); if (!CHIP_IS_E1x(sc)) { if (IS_MF_AFEX(sc)) { /* * configure that AFEX and VLAN headers must be * sent in AFEX mode */ REG_WR(sc, PBF_REG_HDRS_AFTER_BASIC, 0xE); REG_WR(sc, PBF_REG_MUST_HAVE_HDRS, 0xA); REG_WR(sc, PBF_REG_HDRS_AFTER_TAG_0, 0x6); REG_WR(sc, PBF_REG_TAG_ETHERTYPE_0, 0x8926); REG_WR(sc, PBF_REG_TAG_LEN_0, 0x4); } else { REG_WR(sc, PBF_REG_HDRS_AFTER_BASIC, sc->devinfo.mf_info.path_has_ovlan ? 7 : 6); } } REG_WR(sc, SRC_REG_SOFT_RST, 1); ecore_init_block(sc, BLOCK_SRC, PHASE_COMMON); if (CNIC_SUPPORT(sc)) { REG_WR(sc, SRC_REG_KEYSEARCH_0, 0x63285672); REG_WR(sc, SRC_REG_KEYSEARCH_1, 0x24b8f2cc); REG_WR(sc, SRC_REG_KEYSEARCH_2, 0x223aef9b); REG_WR(sc, SRC_REG_KEYSEARCH_3, 0x26001e3a); REG_WR(sc, SRC_REG_KEYSEARCH_4, 0x7ae91116); REG_WR(sc, SRC_REG_KEYSEARCH_5, 0x5ce5230b); REG_WR(sc, SRC_REG_KEYSEARCH_6, 0x298d8adf); REG_WR(sc, SRC_REG_KEYSEARCH_7, 0x6eb0ff09); REG_WR(sc, SRC_REG_KEYSEARCH_8, 0x1830f82f); REG_WR(sc, SRC_REG_KEYSEARCH_9, 0x01e46be7); } REG_WR(sc, SRC_REG_SOFT_RST, 0); if (sizeof(union cdu_context) != 1024) { /* we currently assume that a context is 1024 bytes */ BLOGE(sc, "please adjust the size of cdu_context(%ld)\n", (long)sizeof(union cdu_context)); } ecore_init_block(sc, BLOCK_CDU, PHASE_COMMON); val = (4 << 24) + (0 << 12) + 1024; REG_WR(sc, CDU_REG_CDU_GLOBAL_PARAMS, val); ecore_init_block(sc, BLOCK_CFC, PHASE_COMMON); REG_WR(sc, CFC_REG_INIT_REG, 0x7FF); /* enable context validation interrupt from CFC */ REG_WR(sc, CFC_REG_CFC_INT_MASK, 0); /* set the thresholds to prevent CFC/CDU race */ REG_WR(sc, CFC_REG_DEBUG0, 0x20020000); ecore_init_block(sc, BLOCK_HC, PHASE_COMMON); if (!CHIP_IS_E1x(sc) && BXE_NOMCP(sc)) { REG_WR(sc, IGU_REG_RESET_MEMORIES, 0x36); } ecore_init_block(sc, BLOCK_IGU, PHASE_COMMON); ecore_init_block(sc, BLOCK_MISC_AEU, PHASE_COMMON); /* Reset PCIE errors for debug */ REG_WR(sc, 0x2814, 0xffffffff); REG_WR(sc, 0x3820, 0xffffffff); if (!CHIP_IS_E1x(sc)) { REG_WR(sc, PCICFG_OFFSET + PXPCS_TL_CONTROL_5, (PXPCS_TL_CONTROL_5_ERR_UNSPPORT1 | PXPCS_TL_CONTROL_5_ERR_UNSPPORT)); REG_WR(sc, PCICFG_OFFSET + PXPCS_TL_FUNC345_STAT, (PXPCS_TL_FUNC345_STAT_ERR_UNSPPORT4 | PXPCS_TL_FUNC345_STAT_ERR_UNSPPORT3 | PXPCS_TL_FUNC345_STAT_ERR_UNSPPORT2)); REG_WR(sc, PCICFG_OFFSET + PXPCS_TL_FUNC678_STAT, (PXPCS_TL_FUNC678_STAT_ERR_UNSPPORT7 | PXPCS_TL_FUNC678_STAT_ERR_UNSPPORT6 | PXPCS_TL_FUNC678_STAT_ERR_UNSPPORT5)); } ecore_init_block(sc, BLOCK_NIG, PHASE_COMMON); if (!CHIP_IS_E1(sc)) { /* in E3 this done in per-port section */ if (!CHIP_IS_E3(sc)) REG_WR(sc, NIG_REG_LLH_MF_MODE, IS_MF(sc)); } if (CHIP_IS_E1H(sc)) { /* not applicable for E2 (and above ...) */ REG_WR(sc, NIG_REG_LLH_E1HOV_MODE, IS_MF_SD(sc)); } if (CHIP_REV_IS_SLOW(sc)) { DELAY(200000); } /* finish CFC init */ val = reg_poll(sc, CFC_REG_LL_INIT_DONE, 1, 100, 10); if (val != 1) { BLOGE(sc, "CFC LL_INIT failed\n"); return (-1); } val = reg_poll(sc, CFC_REG_AC_INIT_DONE, 1, 100, 10); if (val != 1) { BLOGE(sc, "CFC AC_INIT failed\n"); return (-1); } val = reg_poll(sc, CFC_REG_CAM_INIT_DONE, 1, 100, 10); if (val != 1) { BLOGE(sc, "CFC CAM_INIT failed\n"); return (-1); } REG_WR(sc, CFC_REG_DEBUG0, 0); if (CHIP_IS_E1(sc)) { /* read NIG statistic to see if this is our first up since powerup */ bxe_read_dmae(sc, NIG_REG_STAT2_BRB_OCTET, 2); val = *BXE_SP(sc, wb_data[0]); /* do internal memory self test */ if ((val == 0) && bxe_int_mem_test(sc)) { BLOGE(sc, "internal mem self test failed\n"); return (-1); } } bxe_setup_fan_failure_detection(sc); /* clear PXP2 attentions */ REG_RD(sc, PXP2_REG_PXP2_INT_STS_CLR_0); bxe_enable_blocks_attention(sc); if (!CHIP_REV_IS_SLOW(sc)) { ecore_enable_blocks_parity(sc); } if (!BXE_NOMCP(sc)) { if (CHIP_IS_E1x(sc)) { bxe_common_init_phy(sc); } } return (0); } /** * bxe_init_hw_common_chip - init HW at the COMMON_CHIP phase. * * @sc: driver handle */ static int bxe_init_hw_common_chip(struct bxe_softc *sc) { int rc = bxe_init_hw_common(sc); if (rc) { return (rc); } /* In E2 2-PORT mode, same ext phy is used for the two paths */ if (!BXE_NOMCP(sc)) { bxe_common_init_phy(sc); } return (0); } static int bxe_init_hw_port(struct bxe_softc *sc) { int port = SC_PORT(sc); int init_phase = port ? PHASE_PORT1 : PHASE_PORT0; uint32_t low, high; uint32_t val; BLOGD(sc, DBG_LOAD, "starting port init for port %d\n", port); REG_WR(sc, NIG_REG_MASK_INTERRUPT_PORT0 + port*4, 0); ecore_init_block(sc, BLOCK_MISC, init_phase); ecore_init_block(sc, BLOCK_PXP, init_phase); ecore_init_block(sc, BLOCK_PXP2, init_phase); /* * Timers bug workaround: disables the pf_master bit in pglue at * common phase, we need to enable it here before any dmae access are * attempted. Therefore we manually added the enable-master to the * port phase (it also happens in the function phase) */ if (!CHIP_IS_E1x(sc)) { REG_WR(sc, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, 1); } ecore_init_block(sc, BLOCK_ATC, init_phase); ecore_init_block(sc, BLOCK_DMAE, init_phase); ecore_init_block(sc, BLOCK_PGLUE_B, init_phase); ecore_init_block(sc, BLOCK_QM, init_phase); ecore_init_block(sc, BLOCK_TCM, init_phase); ecore_init_block(sc, BLOCK_UCM, init_phase); ecore_init_block(sc, BLOCK_CCM, init_phase); ecore_init_block(sc, BLOCK_XCM, init_phase); /* QM cid (connection) count */ ecore_qm_init_cid_count(sc, sc->qm_cid_count, INITOP_SET); if (CNIC_SUPPORT(sc)) { ecore_init_block(sc, BLOCK_TM, init_phase); REG_WR(sc, TM_REG_LIN0_SCAN_TIME + port*4, 20); REG_WR(sc, TM_REG_LIN0_MAX_ACTIVE_CID + port*4, 31); } ecore_init_block(sc, BLOCK_DORQ, init_phase); ecore_init_block(sc, BLOCK_BRB1, init_phase); if (CHIP_IS_E1(sc) || CHIP_IS_E1H(sc)) { if (IS_MF(sc)) { low = (BXE_ONE_PORT(sc) ? 160 : 246); } else if (sc->mtu > 4096) { if (BXE_ONE_PORT(sc)) { low = 160; } else { val = sc->mtu; /* (24*1024 + val*4)/256 */ low = (96 + (val / 64) + ((val % 64) ? 1 : 0)); } } else { low = (BXE_ONE_PORT(sc) ? 80 : 160); } high = (low + 56); /* 14*1024/256 */ REG_WR(sc, BRB1_REG_PAUSE_LOW_THRESHOLD_0 + port*4, low); REG_WR(sc, BRB1_REG_PAUSE_HIGH_THRESHOLD_0 + port*4, high); } if (CHIP_IS_MODE_4_PORT(sc)) { REG_WR(sc, SC_PORT(sc) ? BRB1_REG_MAC_GUARANTIED_1 : BRB1_REG_MAC_GUARANTIED_0, 40); } ecore_init_block(sc, BLOCK_PRS, init_phase); if (CHIP_IS_E3B0(sc)) { if (IS_MF_AFEX(sc)) { /* configure headers for AFEX mode */ REG_WR(sc, SC_PORT(sc) ? PRS_REG_HDRS_AFTER_BASIC_PORT_1 : PRS_REG_HDRS_AFTER_BASIC_PORT_0, 0xE); REG_WR(sc, SC_PORT(sc) ? PRS_REG_HDRS_AFTER_TAG_0_PORT_1 : PRS_REG_HDRS_AFTER_TAG_0_PORT_0, 0x6); REG_WR(sc, SC_PORT(sc) ? PRS_REG_MUST_HAVE_HDRS_PORT_1 : PRS_REG_MUST_HAVE_HDRS_PORT_0, 0xA); } else { /* Ovlan exists only if we are in multi-function + * switch-dependent mode, in switch-independent there * is no ovlan headers */ REG_WR(sc, SC_PORT(sc) ? PRS_REG_HDRS_AFTER_BASIC_PORT_1 : PRS_REG_HDRS_AFTER_BASIC_PORT_0, (sc->devinfo.mf_info.path_has_ovlan ? 7 : 6)); } } ecore_init_block(sc, BLOCK_TSDM, init_phase); ecore_init_block(sc, BLOCK_CSDM, init_phase); ecore_init_block(sc, BLOCK_USDM, init_phase); ecore_init_block(sc, BLOCK_XSDM, init_phase); ecore_init_block(sc, BLOCK_TSEM, init_phase); ecore_init_block(sc, BLOCK_USEM, init_phase); ecore_init_block(sc, BLOCK_CSEM, init_phase); ecore_init_block(sc, BLOCK_XSEM, init_phase); ecore_init_block(sc, BLOCK_UPB, init_phase); ecore_init_block(sc, BLOCK_XPB, init_phase); ecore_init_block(sc, BLOCK_PBF, init_phase); if (CHIP_IS_E1x(sc)) { /* configure PBF to work without PAUSE mtu 9000 */ REG_WR(sc, PBF_REG_P0_PAUSE_ENABLE + port*4, 0); /* update threshold */ REG_WR(sc, PBF_REG_P0_ARB_THRSH + port*4, (9040/16)); /* update init credit */ REG_WR(sc, PBF_REG_P0_INIT_CRD + port*4, (9040/16) + 553 - 22); /* probe changes */ REG_WR(sc, PBF_REG_INIT_P0 + port*4, 1); DELAY(50); REG_WR(sc, PBF_REG_INIT_P0 + port*4, 0); } if (CNIC_SUPPORT(sc)) { ecore_init_block(sc, BLOCK_SRC, init_phase); } ecore_init_block(sc, BLOCK_CDU, init_phase); ecore_init_block(sc, BLOCK_CFC, init_phase); if (CHIP_IS_E1(sc)) { REG_WR(sc, HC_REG_LEADING_EDGE_0 + port*8, 0); REG_WR(sc, HC_REG_TRAILING_EDGE_0 + port*8, 0); } ecore_init_block(sc, BLOCK_HC, init_phase); ecore_init_block(sc, BLOCK_IGU, init_phase); ecore_init_block(sc, BLOCK_MISC_AEU, init_phase); /* init aeu_mask_attn_func_0/1: * - SF mode: bits 3-7 are masked. only bits 0-2 are in use * - MF mode: bit 3 is masked. bits 0-2 are in use as in SF * bits 4-7 are used for "per vn group attention" */ val = IS_MF(sc) ? 0xF7 : 0x7; /* Enable DCBX attention for all but E1 */ val |= CHIP_IS_E1(sc) ? 0 : 0x10; REG_WR(sc, MISC_REG_AEU_MASK_ATTN_FUNC_0 + port*4, val); ecore_init_block(sc, BLOCK_NIG, init_phase); if (!CHIP_IS_E1x(sc)) { /* Bit-map indicating which L2 hdrs may appear after the * basic Ethernet header */ if (IS_MF_AFEX(sc)) { REG_WR(sc, SC_PORT(sc) ? NIG_REG_P1_HDRS_AFTER_BASIC : NIG_REG_P0_HDRS_AFTER_BASIC, 0xE); } else { REG_WR(sc, SC_PORT(sc) ? NIG_REG_P1_HDRS_AFTER_BASIC : NIG_REG_P0_HDRS_AFTER_BASIC, IS_MF_SD(sc) ? 7 : 6); } if (CHIP_IS_E3(sc)) { REG_WR(sc, SC_PORT(sc) ? NIG_REG_LLH1_MF_MODE : NIG_REG_LLH_MF_MODE, IS_MF(sc)); } } if (!CHIP_IS_E3(sc)) { REG_WR(sc, NIG_REG_XGXS_SERDES0_MODE_SEL + port*4, 1); } if (!CHIP_IS_E1(sc)) { /* 0x2 disable mf_ov, 0x1 enable */ REG_WR(sc, NIG_REG_LLH0_BRB1_DRV_MASK_MF + port*4, (IS_MF_SD(sc) ? 0x1 : 0x2)); if (!CHIP_IS_E1x(sc)) { val = 0; switch (sc->devinfo.mf_info.mf_mode) { case MULTI_FUNCTION_SD: val = 1; break; case MULTI_FUNCTION_SI: case MULTI_FUNCTION_AFEX: val = 2; break; } REG_WR(sc, (SC_PORT(sc) ? NIG_REG_LLH1_CLS_TYPE : NIG_REG_LLH0_CLS_TYPE), val); } REG_WR(sc, NIG_REG_LLFC_ENABLE_0 + port*4, 0); REG_WR(sc, NIG_REG_LLFC_OUT_EN_0 + port*4, 0); REG_WR(sc, NIG_REG_PAUSE_ENABLE_0 + port*4, 1); } /* If SPIO5 is set to generate interrupts, enable it for this port */ val = REG_RD(sc, MISC_REG_SPIO_EVENT_EN); if (val & MISC_SPIO_SPIO5) { uint32_t reg_addr = (port ? MISC_REG_AEU_ENABLE1_FUNC_1_OUT_0 : MISC_REG_AEU_ENABLE1_FUNC_0_OUT_0); val = REG_RD(sc, reg_addr); val |= AEU_INPUTS_ATTN_BITS_SPIO5; REG_WR(sc, reg_addr, val); } return (0); } static uint32_t bxe_flr_clnup_reg_poll(struct bxe_softc *sc, uint32_t reg, uint32_t expected, uint32_t poll_count) { uint32_t cur_cnt = poll_count; uint32_t val; while ((val = REG_RD(sc, reg)) != expected && cur_cnt--) { DELAY(FLR_WAIT_INTERVAL); } return (val); } static int bxe_flr_clnup_poll_hw_counter(struct bxe_softc *sc, uint32_t reg, char *msg, uint32_t poll_cnt) { uint32_t val = bxe_flr_clnup_reg_poll(sc, reg, 0, poll_cnt); if (val != 0) { BLOGE(sc, "%s usage count=%d\n", msg, val); return (1); } return (0); } /* Common routines with VF FLR cleanup */ static uint32_t bxe_flr_clnup_poll_count(struct bxe_softc *sc) { /* adjust polling timeout */ if (CHIP_REV_IS_EMUL(sc)) { return (FLR_POLL_CNT * 2000); } if (CHIP_REV_IS_FPGA(sc)) { return (FLR_POLL_CNT * 120); } return (FLR_POLL_CNT); } static int bxe_poll_hw_usage_counters(struct bxe_softc *sc, uint32_t poll_cnt) { /* wait for CFC PF usage-counter to zero (includes all the VFs) */ if (bxe_flr_clnup_poll_hw_counter(sc, CFC_REG_NUM_LCIDS_INSIDE_PF, "CFC PF usage counter timed out", poll_cnt)) { return (1); } /* Wait for DQ PF usage-counter to zero (until DQ cleanup) */ if (bxe_flr_clnup_poll_hw_counter(sc, DORQ_REG_PF_USAGE_CNT, "DQ PF usage counter timed out", poll_cnt)) { return (1); } /* Wait for QM PF usage-counter to zero (until DQ cleanup) */ if (bxe_flr_clnup_poll_hw_counter(sc, QM_REG_PF_USG_CNT_0 + 4*SC_FUNC(sc), "QM PF usage counter timed out", poll_cnt)) { return (1); } /* Wait for Timer PF usage-counters to zero (until DQ cleanup) */ if (bxe_flr_clnup_poll_hw_counter(sc, TM_REG_LIN0_VNIC_UC + 4*SC_PORT(sc), "Timers VNIC usage counter timed out", poll_cnt)) { return (1); } if (bxe_flr_clnup_poll_hw_counter(sc, TM_REG_LIN0_NUM_SCANS + 4*SC_PORT(sc), "Timers NUM_SCANS usage counter timed out", poll_cnt)) { return (1); } /* Wait DMAE PF usage counter to zero */ if (bxe_flr_clnup_poll_hw_counter(sc, dmae_reg_go_c[INIT_DMAE_C(sc)], "DMAE dommand register timed out", poll_cnt)) { return (1); } return (0); } #define OP_GEN_PARAM(param) \ (((param) << SDM_OP_GEN_COMP_PARAM_SHIFT) & SDM_OP_GEN_COMP_PARAM) #define OP_GEN_TYPE(type) \ (((type) << SDM_OP_GEN_COMP_TYPE_SHIFT) & SDM_OP_GEN_COMP_TYPE) #define OP_GEN_AGG_VECT(index) \ (((index) << SDM_OP_GEN_AGG_VECT_IDX_SHIFT) & SDM_OP_GEN_AGG_VECT_IDX) static int bxe_send_final_clnup(struct bxe_softc *sc, uint8_t clnup_func, uint32_t poll_cnt) { uint32_t op_gen_command = 0; uint32_t comp_addr = (BAR_CSTRORM_INTMEM + CSTORM_FINAL_CLEANUP_COMPLETE_OFFSET(clnup_func)); int ret = 0; if (REG_RD(sc, comp_addr)) { BLOGE(sc, "Cleanup complete was not 0 before sending\n"); return (1); } op_gen_command |= OP_GEN_PARAM(XSTORM_AGG_INT_FINAL_CLEANUP_INDEX); op_gen_command |= OP_GEN_TYPE(XSTORM_AGG_INT_FINAL_CLEANUP_COMP_TYPE); op_gen_command |= OP_GEN_AGG_VECT(clnup_func); op_gen_command |= 1 << SDM_OP_GEN_AGG_VECT_IDX_VALID_SHIFT; BLOGD(sc, DBG_LOAD, "sending FW Final cleanup\n"); REG_WR(sc, XSDM_REG_OPERATION_GEN, op_gen_command); if (bxe_flr_clnup_reg_poll(sc, comp_addr, 1, poll_cnt) != 1) { BLOGE(sc, "FW final cleanup did not succeed\n"); BLOGD(sc, DBG_LOAD, "At timeout completion address contained %x\n", (REG_RD(sc, comp_addr))); bxe_panic(sc, ("FLR cleanup failed\n")); return (1); } /* Zero completion for nxt FLR */ REG_WR(sc, comp_addr, 0); return (ret); } static void bxe_pbf_pN_buf_flushed(struct bxe_softc *sc, struct pbf_pN_buf_regs *regs, uint32_t poll_count) { uint32_t init_crd, crd, crd_start, crd_freed, crd_freed_start; uint32_t cur_cnt = poll_count; crd_freed = crd_freed_start = REG_RD(sc, regs->crd_freed); crd = crd_start = REG_RD(sc, regs->crd); init_crd = REG_RD(sc, regs->init_crd); BLOGD(sc, DBG_LOAD, "INIT CREDIT[%d] : %x\n", regs->pN, init_crd); BLOGD(sc, DBG_LOAD, "CREDIT[%d] : s:%x\n", regs->pN, crd); BLOGD(sc, DBG_LOAD, "CREDIT_FREED[%d]: s:%x\n", regs->pN, crd_freed); while ((crd != init_crd) && ((uint32_t)((int32_t)crd_freed - (int32_t)crd_freed_start) < (init_crd - crd_start))) { if (cur_cnt--) { DELAY(FLR_WAIT_INTERVAL); crd = REG_RD(sc, regs->crd); crd_freed = REG_RD(sc, regs->crd_freed); } else { BLOGD(sc, DBG_LOAD, "PBF tx buffer[%d] timed out\n", regs->pN); BLOGD(sc, DBG_LOAD, "CREDIT[%d] : c:%x\n", regs->pN, crd); BLOGD(sc, DBG_LOAD, "CREDIT_FREED[%d]: c:%x\n", regs->pN, crd_freed); break; } } BLOGD(sc, DBG_LOAD, "Waited %d*%d usec for PBF tx buffer[%d]\n", poll_count-cur_cnt, FLR_WAIT_INTERVAL, regs->pN); } static void bxe_pbf_pN_cmd_flushed(struct bxe_softc *sc, struct pbf_pN_cmd_regs *regs, uint32_t poll_count) { uint32_t occup, to_free, freed, freed_start; uint32_t cur_cnt = poll_count; occup = to_free = REG_RD(sc, regs->lines_occup); freed = freed_start = REG_RD(sc, regs->lines_freed); BLOGD(sc, DBG_LOAD, "OCCUPANCY[%d] : s:%x\n", regs->pN, occup); BLOGD(sc, DBG_LOAD, "LINES_FREED[%d] : s:%x\n", regs->pN, freed); while (occup && ((uint32_t)((int32_t)freed - (int32_t)freed_start) < to_free)) { if (cur_cnt--) { DELAY(FLR_WAIT_INTERVAL); occup = REG_RD(sc, regs->lines_occup); freed = REG_RD(sc, regs->lines_freed); } else { BLOGD(sc, DBG_LOAD, "PBF cmd queue[%d] timed out\n", regs->pN); BLOGD(sc, DBG_LOAD, "OCCUPANCY[%d] : s:%x\n", regs->pN, occup); BLOGD(sc, DBG_LOAD, "LINES_FREED[%d] : s:%x\n", regs->pN, freed); break; } } BLOGD(sc, DBG_LOAD, "Waited %d*%d usec for PBF cmd queue[%d]\n", poll_count - cur_cnt, FLR_WAIT_INTERVAL, regs->pN); } static void bxe_tx_hw_flushed(struct bxe_softc *sc, uint32_t poll_count) { struct pbf_pN_cmd_regs cmd_regs[] = { {0, (CHIP_IS_E3B0(sc)) ? PBF_REG_TQ_OCCUPANCY_Q0 : PBF_REG_P0_TQ_OCCUPANCY, (CHIP_IS_E3B0(sc)) ? PBF_REG_TQ_LINES_FREED_CNT_Q0 : PBF_REG_P0_TQ_LINES_FREED_CNT}, {1, (CHIP_IS_E3B0(sc)) ? PBF_REG_TQ_OCCUPANCY_Q1 : PBF_REG_P1_TQ_OCCUPANCY, (CHIP_IS_E3B0(sc)) ? PBF_REG_TQ_LINES_FREED_CNT_Q1 : PBF_REG_P1_TQ_LINES_FREED_CNT}, {4, (CHIP_IS_E3B0(sc)) ? PBF_REG_TQ_OCCUPANCY_LB_Q : PBF_REG_P4_TQ_OCCUPANCY, (CHIP_IS_E3B0(sc)) ? PBF_REG_TQ_LINES_FREED_CNT_LB_Q : PBF_REG_P4_TQ_LINES_FREED_CNT} }; struct pbf_pN_buf_regs buf_regs[] = { {0, (CHIP_IS_E3B0(sc)) ? PBF_REG_INIT_CRD_Q0 : PBF_REG_P0_INIT_CRD , (CHIP_IS_E3B0(sc)) ? PBF_REG_CREDIT_Q0 : PBF_REG_P0_CREDIT, (CHIP_IS_E3B0(sc)) ? PBF_REG_INTERNAL_CRD_FREED_CNT_Q0 : PBF_REG_P0_INTERNAL_CRD_FREED_CNT}, {1, (CHIP_IS_E3B0(sc)) ? PBF_REG_INIT_CRD_Q1 : PBF_REG_P1_INIT_CRD, (CHIP_IS_E3B0(sc)) ? PBF_REG_CREDIT_Q1 : PBF_REG_P1_CREDIT, (CHIP_IS_E3B0(sc)) ? PBF_REG_INTERNAL_CRD_FREED_CNT_Q1 : PBF_REG_P1_INTERNAL_CRD_FREED_CNT}, {4, (CHIP_IS_E3B0(sc)) ? PBF_REG_INIT_CRD_LB_Q : PBF_REG_P4_INIT_CRD, (CHIP_IS_E3B0(sc)) ? PBF_REG_CREDIT_LB_Q : PBF_REG_P4_CREDIT, (CHIP_IS_E3B0(sc)) ? PBF_REG_INTERNAL_CRD_FREED_CNT_LB_Q : PBF_REG_P4_INTERNAL_CRD_FREED_CNT}, }; int i; /* Verify the command queues are flushed P0, P1, P4 */ for (i = 0; i < ARRAY_SIZE(cmd_regs); i++) { bxe_pbf_pN_cmd_flushed(sc, &cmd_regs[i], poll_count); } /* Verify the transmission buffers are flushed P0, P1, P4 */ for (i = 0; i < ARRAY_SIZE(buf_regs); i++) { bxe_pbf_pN_buf_flushed(sc, &buf_regs[i], poll_count); } } static void bxe_hw_enable_status(struct bxe_softc *sc) { uint32_t val; val = REG_RD(sc, CFC_REG_WEAK_ENABLE_PF); BLOGD(sc, DBG_LOAD, "CFC_REG_WEAK_ENABLE_PF is 0x%x\n", val); val = REG_RD(sc, PBF_REG_DISABLE_PF); BLOGD(sc, DBG_LOAD, "PBF_REG_DISABLE_PF is 0x%x\n", val); val = REG_RD(sc, IGU_REG_PCI_PF_MSI_EN); BLOGD(sc, DBG_LOAD, "IGU_REG_PCI_PF_MSI_EN is 0x%x\n", val); val = REG_RD(sc, IGU_REG_PCI_PF_MSIX_EN); BLOGD(sc, DBG_LOAD, "IGU_REG_PCI_PF_MSIX_EN is 0x%x\n", val); val = REG_RD(sc, IGU_REG_PCI_PF_MSIX_FUNC_MASK); BLOGD(sc, DBG_LOAD, "IGU_REG_PCI_PF_MSIX_FUNC_MASK is 0x%x\n", val); val = REG_RD(sc, PGLUE_B_REG_SHADOW_BME_PF_7_0_CLR); BLOGD(sc, DBG_LOAD, "PGLUE_B_REG_SHADOW_BME_PF_7_0_CLR is 0x%x\n", val); val = REG_RD(sc, PGLUE_B_REG_FLR_REQUEST_PF_7_0_CLR); BLOGD(sc, DBG_LOAD, "PGLUE_B_REG_FLR_REQUEST_PF_7_0_CLR is 0x%x\n", val); val = REG_RD(sc, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER); BLOGD(sc, DBG_LOAD, "PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER is 0x%x\n", val); } static int bxe_pf_flr_clnup(struct bxe_softc *sc) { uint32_t poll_cnt = bxe_flr_clnup_poll_count(sc); BLOGD(sc, DBG_LOAD, "Cleanup after FLR PF[%d]\n", SC_ABS_FUNC(sc)); /* Re-enable PF target read access */ REG_WR(sc, PGLUE_B_REG_INTERNAL_PFID_ENABLE_TARGET_READ, 1); /* Poll HW usage counters */ BLOGD(sc, DBG_LOAD, "Polling usage counters\n"); if (bxe_poll_hw_usage_counters(sc, poll_cnt)) { return (-1); } /* Zero the igu 'trailing edge' and 'leading edge' */ /* Send the FW cleanup command */ if (bxe_send_final_clnup(sc, (uint8_t)SC_FUNC(sc), poll_cnt)) { return (-1); } /* ATC cleanup */ /* Verify TX hw is flushed */ bxe_tx_hw_flushed(sc, poll_cnt); /* Wait 100ms (not adjusted according to platform) */ DELAY(100000); /* Verify no pending pci transactions */ if (bxe_is_pcie_pending(sc)) { BLOGE(sc, "PCIE Transactions still pending\n"); } /* Debug */ bxe_hw_enable_status(sc); /* * Master enable - Due to WB DMAE writes performed before this * register is re-initialized as part of the regular function init */ REG_WR(sc, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, 1); return (0); } #if 0 static void bxe_init_searcher(struct bxe_softc *sc) { int port = SC_PORT(sc); ecore_src_init_t2(sc, sc->t2, sc->t2_mapping, SRC_CONN_NUM); /* T1 hash bits value determines the T1 number of entries */ REG_WR(sc, SRC_REG_NUMBER_HASH_BITS0 + port*4, SRC_HASH_BITS); } #endif static int bxe_init_hw_func(struct bxe_softc *sc) { int port = SC_PORT(sc); int func = SC_FUNC(sc); int init_phase = PHASE_PF0 + func; struct ecore_ilt *ilt = sc->ilt; uint16_t cdu_ilt_start; uint32_t addr, val; uint32_t main_mem_base, main_mem_size, main_mem_prty_clr; int i, main_mem_width, rc; BLOGD(sc, DBG_LOAD, "starting func init for func %d\n", func); /* FLR cleanup */ if (!CHIP_IS_E1x(sc)) { rc = bxe_pf_flr_clnup(sc); if (rc) { BLOGE(sc, "FLR cleanup failed!\n"); // XXX bxe_fw_dump(sc); // XXX bxe_idle_chk(sc); return (rc); } } /* set MSI reconfigure capability */ if (sc->devinfo.int_block == INT_BLOCK_HC) { addr = (port ? HC_REG_CONFIG_1 : HC_REG_CONFIG_0); val = REG_RD(sc, addr); val |= HC_CONFIG_0_REG_MSI_ATTN_EN_0; REG_WR(sc, addr, val); } ecore_init_block(sc, BLOCK_PXP, init_phase); ecore_init_block(sc, BLOCK_PXP2, init_phase); ilt = sc->ilt; cdu_ilt_start = ilt->clients[ILT_CLIENT_CDU].start; #if 0 if (IS_SRIOV(sc)) { cdu_ilt_start += BXE_FIRST_VF_CID/ILT_PAGE_CIDS; } cdu_ilt_start = bxe_iov_init_ilt(sc, cdu_ilt_start); #if (BXE_FIRST_VF_CID > 0) /* * If BXE_FIRST_VF_CID > 0 then the PF L2 cids precedes * those of the VFs, so start line should be reset */ cdu_ilt_start = ilt->clients[ILT_CLIENT_CDU].start; #endif #endif for (i = 0; i < L2_ILT_LINES(sc); i++) { ilt->lines[cdu_ilt_start + i].page = sc->context[i].vcxt; ilt->lines[cdu_ilt_start + i].page_mapping = sc->context[i].vcxt_dma.paddr; ilt->lines[cdu_ilt_start + i].size = sc->context[i].size; } ecore_ilt_init_op(sc, INITOP_SET); #if 0 if (!CONFIGURE_NIC_MODE(sc)) { bxe_init_searcher(sc); REG_WR(sc, PRS_REG_NIC_MODE, 0); BLOGD(sc, DBG_LOAD, "NIC MODE disabled\n"); } else #endif { /* Set NIC mode */ REG_WR(sc, PRS_REG_NIC_MODE, 1); BLOGD(sc, DBG_LOAD, "NIC MODE configured\n"); } if (!CHIP_IS_E1x(sc)) { uint32_t pf_conf = IGU_PF_CONF_FUNC_EN; /* Turn on a single ISR mode in IGU if driver is going to use * INT#x or MSI */ if (sc->interrupt_mode != INTR_MODE_MSIX) { pf_conf |= IGU_PF_CONF_SINGLE_ISR_EN; } /* * Timers workaround bug: function init part. * Need to wait 20msec after initializing ILT, * needed to make sure there are no requests in * one of the PXP internal queues with "old" ILT addresses */ DELAY(20000); /* * Master enable - Due to WB DMAE writes performed before this * register is re-initialized as part of the regular function * init */ REG_WR(sc, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, 1); /* Enable the function in IGU */ REG_WR(sc, IGU_REG_PF_CONFIGURATION, pf_conf); } sc->dmae_ready = 1; ecore_init_block(sc, BLOCK_PGLUE_B, init_phase); if (!CHIP_IS_E1x(sc)) REG_WR(sc, PGLUE_B_REG_WAS_ERROR_PF_7_0_CLR, func); ecore_init_block(sc, BLOCK_ATC, init_phase); ecore_init_block(sc, BLOCK_DMAE, init_phase); ecore_init_block(sc, BLOCK_NIG, init_phase); ecore_init_block(sc, BLOCK_SRC, init_phase); ecore_init_block(sc, BLOCK_MISC, init_phase); ecore_init_block(sc, BLOCK_TCM, init_phase); ecore_init_block(sc, BLOCK_UCM, init_phase); ecore_init_block(sc, BLOCK_CCM, init_phase); ecore_init_block(sc, BLOCK_XCM, init_phase); ecore_init_block(sc, BLOCK_TSEM, init_phase); ecore_init_block(sc, BLOCK_USEM, init_phase); ecore_init_block(sc, BLOCK_CSEM, init_phase); ecore_init_block(sc, BLOCK_XSEM, init_phase); if (!CHIP_IS_E1x(sc)) REG_WR(sc, QM_REG_PF_EN, 1); if (!CHIP_IS_E1x(sc)) { REG_WR(sc, TSEM_REG_VFPF_ERR_NUM, BXE_MAX_NUM_OF_VFS + func); REG_WR(sc, USEM_REG_VFPF_ERR_NUM, BXE_MAX_NUM_OF_VFS + func); REG_WR(sc, CSEM_REG_VFPF_ERR_NUM, BXE_MAX_NUM_OF_VFS + func); REG_WR(sc, XSEM_REG_VFPF_ERR_NUM, BXE_MAX_NUM_OF_VFS + func); } ecore_init_block(sc, BLOCK_QM, init_phase); ecore_init_block(sc, BLOCK_TM, init_phase); ecore_init_block(sc, BLOCK_DORQ, init_phase); bxe_iov_init_dq(sc); ecore_init_block(sc, BLOCK_BRB1, init_phase); ecore_init_block(sc, BLOCK_PRS, init_phase); ecore_init_block(sc, BLOCK_TSDM, init_phase); ecore_init_block(sc, BLOCK_CSDM, init_phase); ecore_init_block(sc, BLOCK_USDM, init_phase); ecore_init_block(sc, BLOCK_XSDM, init_phase); ecore_init_block(sc, BLOCK_UPB, init_phase); ecore_init_block(sc, BLOCK_XPB, init_phase); ecore_init_block(sc, BLOCK_PBF, init_phase); if (!CHIP_IS_E1x(sc)) REG_WR(sc, PBF_REG_DISABLE_PF, 0); ecore_init_block(sc, BLOCK_CDU, init_phase); ecore_init_block(sc, BLOCK_CFC, init_phase); if (!CHIP_IS_E1x(sc)) REG_WR(sc, CFC_REG_WEAK_ENABLE_PF, 1); if (IS_MF(sc)) { REG_WR(sc, NIG_REG_LLH0_FUNC_EN + port*8, 1); REG_WR(sc, NIG_REG_LLH0_FUNC_VLAN_ID + port*8, OVLAN(sc)); } ecore_init_block(sc, BLOCK_MISC_AEU, init_phase); /* HC init per function */ if (sc->devinfo.int_block == INT_BLOCK_HC) { if (CHIP_IS_E1H(sc)) { REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_12 + func*4, 0); REG_WR(sc, HC_REG_LEADING_EDGE_0 + port*8, 0); REG_WR(sc, HC_REG_TRAILING_EDGE_0 + port*8, 0); } ecore_init_block(sc, BLOCK_HC, init_phase); } else { int num_segs, sb_idx, prod_offset; REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_12 + func*4, 0); if (!CHIP_IS_E1x(sc)) { REG_WR(sc, IGU_REG_LEADING_EDGE_LATCH, 0); REG_WR(sc, IGU_REG_TRAILING_EDGE_LATCH, 0); } ecore_init_block(sc, BLOCK_IGU, init_phase); if (!CHIP_IS_E1x(sc)) { int dsb_idx = 0; /** * Producer memory: * E2 mode: address 0-135 match to the mapping memory; * 136 - PF0 default prod; 137 - PF1 default prod; * 138 - PF2 default prod; 139 - PF3 default prod; * 140 - PF0 attn prod; 141 - PF1 attn prod; * 142 - PF2 attn prod; 143 - PF3 attn prod; * 144-147 reserved. * * E1.5 mode - In backward compatible mode; * for non default SB; each even line in the memory * holds the U producer and each odd line hold * the C producer. The first 128 producers are for * NDSB (PF0 - 0-31; PF1 - 32-63 and so on). The last 20 * producers are for the DSB for each PF. * Each PF has five segments: (the order inside each * segment is PF0; PF1; PF2; PF3) - 128-131 U prods; * 132-135 C prods; 136-139 X prods; 140-143 T prods; * 144-147 attn prods; */ /* non-default-status-blocks */ num_segs = CHIP_INT_MODE_IS_BC(sc) ? IGU_BC_NDSB_NUM_SEGS : IGU_NORM_NDSB_NUM_SEGS; for (sb_idx = 0; sb_idx < sc->igu_sb_cnt; sb_idx++) { prod_offset = (sc->igu_base_sb + sb_idx) * num_segs; for (i = 0; i < num_segs; i++) { addr = IGU_REG_PROD_CONS_MEMORY + (prod_offset + i) * 4; REG_WR(sc, addr, 0); } /* send consumer update with value 0 */ bxe_ack_sb(sc, sc->igu_base_sb + sb_idx, USTORM_ID, 0, IGU_INT_NOP, 1); bxe_igu_clear_sb(sc, sc->igu_base_sb + sb_idx); } /* default-status-blocks */ num_segs = CHIP_INT_MODE_IS_BC(sc) ? IGU_BC_DSB_NUM_SEGS : IGU_NORM_DSB_NUM_SEGS; if (CHIP_IS_MODE_4_PORT(sc)) dsb_idx = SC_FUNC(sc); else dsb_idx = SC_VN(sc); prod_offset = (CHIP_INT_MODE_IS_BC(sc) ? IGU_BC_BASE_DSB_PROD + dsb_idx : IGU_NORM_BASE_DSB_PROD + dsb_idx); /* * igu prods come in chunks of E1HVN_MAX (4) - * does not matters what is the current chip mode */ for (i = 0; i < (num_segs * E1HVN_MAX); i += E1HVN_MAX) { addr = IGU_REG_PROD_CONS_MEMORY + (prod_offset + i)*4; REG_WR(sc, addr, 0); } /* send consumer update with 0 */ if (CHIP_INT_MODE_IS_BC(sc)) { bxe_ack_sb(sc, sc->igu_dsb_id, USTORM_ID, 0, IGU_INT_NOP, 1); bxe_ack_sb(sc, sc->igu_dsb_id, CSTORM_ID, 0, IGU_INT_NOP, 1); bxe_ack_sb(sc, sc->igu_dsb_id, XSTORM_ID, 0, IGU_INT_NOP, 1); bxe_ack_sb(sc, sc->igu_dsb_id, TSTORM_ID, 0, IGU_INT_NOP, 1); bxe_ack_sb(sc, sc->igu_dsb_id, ATTENTION_ID, 0, IGU_INT_NOP, 1); } else { bxe_ack_sb(sc, sc->igu_dsb_id, USTORM_ID, 0, IGU_INT_NOP, 1); bxe_ack_sb(sc, sc->igu_dsb_id, ATTENTION_ID, 0, IGU_INT_NOP, 1); } bxe_igu_clear_sb(sc, sc->igu_dsb_id); /* !!! these should become driver const once rf-tool supports split-68 const */ REG_WR(sc, IGU_REG_SB_INT_BEFORE_MASK_LSB, 0); REG_WR(sc, IGU_REG_SB_INT_BEFORE_MASK_MSB, 0); REG_WR(sc, IGU_REG_SB_MASK_LSB, 0); REG_WR(sc, IGU_REG_SB_MASK_MSB, 0); REG_WR(sc, IGU_REG_PBA_STATUS_LSB, 0); REG_WR(sc, IGU_REG_PBA_STATUS_MSB, 0); } } /* Reset PCIE errors for debug */ REG_WR(sc, 0x2114, 0xffffffff); REG_WR(sc, 0x2120, 0xffffffff); if (CHIP_IS_E1x(sc)) { main_mem_size = HC_REG_MAIN_MEMORY_SIZE / 2; /*dwords*/ main_mem_base = HC_REG_MAIN_MEMORY + SC_PORT(sc) * (main_mem_size * 4); main_mem_prty_clr = HC_REG_HC_PRTY_STS_CLR; main_mem_width = 8; val = REG_RD(sc, main_mem_prty_clr); if (val) { BLOGD(sc, DBG_LOAD, "Parity errors in HC block during function init (0x%x)!\n", val); } /* Clear "false" parity errors in MSI-X table */ for (i = main_mem_base; i < main_mem_base + main_mem_size * 4; i += main_mem_width) { bxe_read_dmae(sc, i, main_mem_width / 4); bxe_write_dmae(sc, BXE_SP_MAPPING(sc, wb_data), i, main_mem_width / 4); } /* Clear HC parity attention */ REG_RD(sc, main_mem_prty_clr); } #if 1 /* Enable STORMs SP logging */ REG_WR8(sc, BAR_USTRORM_INTMEM + USTORM_RECORD_SLOW_PATH_OFFSET(SC_FUNC(sc)), 1); REG_WR8(sc, BAR_TSTRORM_INTMEM + TSTORM_RECORD_SLOW_PATH_OFFSET(SC_FUNC(sc)), 1); REG_WR8(sc, BAR_CSTRORM_INTMEM + CSTORM_RECORD_SLOW_PATH_OFFSET(SC_FUNC(sc)), 1); REG_WR8(sc, BAR_XSTRORM_INTMEM + XSTORM_RECORD_SLOW_PATH_OFFSET(SC_FUNC(sc)), 1); #endif elink_phy_probe(&sc->link_params); return (0); } static void bxe_link_reset(struct bxe_softc *sc) { if (!BXE_NOMCP(sc)) { BXE_PHY_LOCK(sc); elink_lfa_reset(&sc->link_params, &sc->link_vars); BXE_PHY_UNLOCK(sc); } else { if (!CHIP_REV_IS_SLOW(sc)) { BLOGW(sc, "Bootcode is missing - cannot reset link\n"); } } } static void bxe_reset_port(struct bxe_softc *sc) { int port = SC_PORT(sc); uint32_t val; /* reset physical Link */ bxe_link_reset(sc); REG_WR(sc, NIG_REG_MASK_INTERRUPT_PORT0 + port*4, 0); /* Do not rcv packets to BRB */ REG_WR(sc, NIG_REG_LLH0_BRB1_DRV_MASK + port*4, 0x0); /* Do not direct rcv packets that are not for MCP to the BRB */ REG_WR(sc, (port ? NIG_REG_LLH1_BRB1_NOT_MCP : NIG_REG_LLH0_BRB1_NOT_MCP), 0x0); /* Configure AEU */ REG_WR(sc, MISC_REG_AEU_MASK_ATTN_FUNC_0 + port*4, 0); DELAY(100000); /* Check for BRB port occupancy */ val = REG_RD(sc, BRB1_REG_PORT_NUM_OCC_BLOCKS_0 + port*4); if (val) { BLOGD(sc, DBG_LOAD, "BRB1 is not empty, %d blocks are occupied\n", val); } /* TODO: Close Doorbell port? */ } static void bxe_ilt_wr(struct bxe_softc *sc, uint32_t index, bus_addr_t addr) { int reg; uint32_t wb_write[2]; if (CHIP_IS_E1(sc)) { reg = PXP2_REG_RQ_ONCHIP_AT + index*8; } else { reg = PXP2_REG_RQ_ONCHIP_AT_B0 + index*8; } wb_write[0] = ONCHIP_ADDR1(addr); wb_write[1] = ONCHIP_ADDR2(addr); REG_WR_DMAE(sc, reg, wb_write, 2); } static void bxe_clear_func_ilt(struct bxe_softc *sc, uint32_t func) { uint32_t i, base = FUNC_ILT_BASE(func); for (i = base; i < base + ILT_PER_FUNC; i++) { bxe_ilt_wr(sc, i, 0); } } static void bxe_reset_func(struct bxe_softc *sc) { struct bxe_fastpath *fp; int port = SC_PORT(sc); int func = SC_FUNC(sc); int i; /* Disable the function in the FW */ REG_WR8(sc, BAR_XSTRORM_INTMEM + XSTORM_FUNC_EN_OFFSET(func), 0); REG_WR8(sc, BAR_CSTRORM_INTMEM + CSTORM_FUNC_EN_OFFSET(func), 0); REG_WR8(sc, BAR_TSTRORM_INTMEM + TSTORM_FUNC_EN_OFFSET(func), 0); REG_WR8(sc, BAR_USTRORM_INTMEM + USTORM_FUNC_EN_OFFSET(func), 0); /* FP SBs */ FOR_EACH_ETH_QUEUE(sc, i) { fp = &sc->fp[i]; REG_WR8(sc, BAR_CSTRORM_INTMEM + CSTORM_STATUS_BLOCK_DATA_STATE_OFFSET(fp->fw_sb_id), SB_DISABLED); } #if 0 if (CNIC_LOADED(sc)) { /* CNIC SB */ REG_WR8(sc, BAR_CSTRORM_INTMEM + CSTORM_STATUS_BLOCK_DATA_STATE_OFFSET (bxe_cnic_fw_sb_id(sc)), SB_DISABLED); } #endif /* SP SB */ REG_WR8(sc, BAR_CSTRORM_INTMEM + CSTORM_SP_STATUS_BLOCK_DATA_STATE_OFFSET(func), SB_DISABLED); for (i = 0; i < XSTORM_SPQ_DATA_SIZE / 4; i++) { REG_WR(sc, BAR_XSTRORM_INTMEM + XSTORM_SPQ_DATA_OFFSET(func), 0); } /* Configure IGU */ if (sc->devinfo.int_block == INT_BLOCK_HC) { REG_WR(sc, HC_REG_LEADING_EDGE_0 + port*8, 0); REG_WR(sc, HC_REG_TRAILING_EDGE_0 + port*8, 0); } else { REG_WR(sc, IGU_REG_LEADING_EDGE_LATCH, 0); REG_WR(sc, IGU_REG_TRAILING_EDGE_LATCH, 0); } if (CNIC_LOADED(sc)) { /* Disable Timer scan */ REG_WR(sc, TM_REG_EN_LINEAR0_TIMER + port*4, 0); /* * Wait for at least 10ms and up to 2 second for the timers * scan to complete */ for (i = 0; i < 200; i++) { DELAY(10000); if (!REG_RD(sc, TM_REG_LIN0_SCAN_ON + port*4)) break; } } /* Clear ILT */ bxe_clear_func_ilt(sc, func); /* * Timers workaround bug for E2: if this is vnic-3, * we need to set the entire ilt range for this timers. */ if (!CHIP_IS_E1x(sc) && SC_VN(sc) == 3) { struct ilt_client_info ilt_cli; /* use dummy TM client */ memset(&ilt_cli, 0, sizeof(struct ilt_client_info)); ilt_cli.start = 0; ilt_cli.end = ILT_NUM_PAGE_ENTRIES - 1; ilt_cli.client_num = ILT_CLIENT_TM; ecore_ilt_boundry_init_op(sc, &ilt_cli, 0, INITOP_CLEAR); } /* this assumes that reset_port() called before reset_func()*/ if (!CHIP_IS_E1x(sc)) { bxe_pf_disable(sc); } sc->dmae_ready = 0; } static int bxe_gunzip_init(struct bxe_softc *sc) { return (0); } static void bxe_gunzip_end(struct bxe_softc *sc) { return; } static int bxe_init_firmware(struct bxe_softc *sc) { if (CHIP_IS_E1(sc)) { ecore_init_e1_firmware(sc); sc->iro_array = e1_iro_arr; } else if (CHIP_IS_E1H(sc)) { ecore_init_e1h_firmware(sc); sc->iro_array = e1h_iro_arr; } else if (!CHIP_IS_E1x(sc)) { ecore_init_e2_firmware(sc); sc->iro_array = e2_iro_arr; } else { BLOGE(sc, "Unsupported chip revision\n"); return (-1); } return (0); } static void bxe_release_firmware(struct bxe_softc *sc) { /* Do nothing */ return; } static int ecore_gunzip(struct bxe_softc *sc, const uint8_t *zbuf, int len) { /* XXX : Implement... */ BLOGD(sc, DBG_LOAD, "ECORE_GUNZIP NOT IMPLEMENTED\n"); return (FALSE); } static void ecore_reg_wr_ind(struct bxe_softc *sc, uint32_t addr, uint32_t val) { bxe_reg_wr_ind(sc, addr, val); } static void ecore_write_dmae_phys_len(struct bxe_softc *sc, bus_addr_t phys_addr, uint32_t addr, uint32_t len) { bxe_write_dmae_phys_len(sc, phys_addr, addr, len); } void ecore_storm_memset_struct(struct bxe_softc *sc, uint32_t addr, size_t size, uint32_t *data) { uint8_t i; for (i = 0; i < size/4; i++) { REG_WR(sc, addr + (i * 4), data[i]); } } Index: head/sys/dev/cxgb/cxgb_sge.c =================================================================== --- head/sys/dev/cxgb/cxgb_sge.c (revision 275357) +++ head/sys/dev/cxgb/cxgb_sge.c (revision 275358) @@ -1,3718 +1,3720 @@ /************************************************************************** Copyright (c) 2007-2009, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int txq_fills = 0; int multiq_tx_enable = 1; #ifdef TCP_OFFLOAD CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS); #endif extern struct sysctl_oid_list sysctl__hw_cxgb_children; int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE; SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0, "size of per-queue mbuf ring"); static int cxgb_tx_coalesce_force = 0; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN, &cxgb_tx_coalesce_force, 0, "coalesce small packets into a single work request regardless of ring state"); #define COALESCE_START_DEFAULT TX_ETH_Q_SIZE>>1 #define COALESCE_START_MAX (TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3)) #define COALESCE_STOP_DEFAULT TX_ETH_Q_SIZE>>2 #define COALESCE_STOP_MIN TX_ETH_Q_SIZE>>5 #define TX_RECLAIM_DEFAULT TX_ETH_Q_SIZE>>5 #define TX_RECLAIM_MAX TX_ETH_Q_SIZE>>2 #define TX_RECLAIM_MIN TX_ETH_Q_SIZE>>6 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN, &cxgb_tx_coalesce_enable_start, 0, "coalesce enable threshold"); static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN, &cxgb_tx_coalesce_enable_stop, 0, "coalesce disable threshold"); static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN, &cxgb_tx_reclaim_threshold, 0, "tx cleaning minimum threshold"); /* * XXX don't re-enable this until TOE stops assuming * we have an m_ext */ static int recycle_enable = 0; extern int cxgb_use_16k_clusters; extern int nmbjumbop; extern int nmbjumbo9; extern int nmbjumbo16; #define USE_GTS 0 #define SGE_RX_SM_BUF_SIZE 1536 #define SGE_RX_DROP_THRES 16 #define SGE_RX_COPY_THRES 128 /* * Period of the Tx buffer reclaim timer. This timer does not need to run * frequently as Tx buffers are usually reclaimed by new Tx packets. */ #define TX_RECLAIM_PERIOD (hz >> 1) /* * Values for sge_txq.flags */ enum { TXQ_RUNNING = 1 << 0, /* fetch engine is running */ TXQ_LAST_PKT_DB = 1 << 1, /* last packet rang the doorbell */ }; struct tx_desc { uint64_t flit[TX_DESC_FLITS]; } __packed; struct rx_desc { uint32_t addr_lo; uint32_t len_gen; uint32_t gen2; uint32_t addr_hi; } __packed; struct rsp_desc { /* response queue descriptor */ struct rss_header rss_hdr; uint32_t flags; uint32_t len_cq; uint8_t imm_data[47]; uint8_t intr_gen; } __packed; #define RX_SW_DESC_MAP_CREATED (1 << 0) #define TX_SW_DESC_MAP_CREATED (1 << 1) #define RX_SW_DESC_INUSE (1 << 3) #define TX_SW_DESC_MAPPED (1 << 4) #define RSPQ_NSOP_NEOP G_RSPD_SOP_EOP(0) #define RSPQ_EOP G_RSPD_SOP_EOP(F_RSPD_EOP) #define RSPQ_SOP G_RSPD_SOP_EOP(F_RSPD_SOP) #define RSPQ_SOP_EOP G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP) struct tx_sw_desc { /* SW state per Tx descriptor */ struct mbuf *m; bus_dmamap_t map; int flags; }; struct rx_sw_desc { /* SW state per Rx descriptor */ caddr_t rxsd_cl; struct mbuf *m; bus_dmamap_t map; int flags; }; struct txq_state { unsigned int compl; unsigned int gen; unsigned int pidx; }; struct refill_fl_cb_arg { int error; bus_dma_segment_t seg; int nseg; }; /* * Maps a number of flits to the number of Tx descriptors that can hold them. * The formula is * * desc = 1 + (flits - 2) / (WR_FLITS - 1). * * HW allows up to 4 descriptors to be combined into a WR. */ static uint8_t flit_desc_map[] = { 0, #if SGE_NUM_GENBITS == 1 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 #elif SGE_NUM_GENBITS == 2 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, #else # error "SGE_NUM_GENBITS must be 1 or 2" #endif }; #define TXQ_LOCK_ASSERT(qs) mtx_assert(&(qs)->lock, MA_OWNED) #define TXQ_TRYLOCK(qs) mtx_trylock(&(qs)->lock) #define TXQ_LOCK(qs) mtx_lock(&(qs)->lock) #define TXQ_UNLOCK(qs) mtx_unlock(&(qs)->lock) #define TXQ_RING_EMPTY(qs) drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_NEEDS_ENQUEUE(qs) \ drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_FLUSH(qs) drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_DEQUEUE_COND(qs, func, arg) \ drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg) #define TXQ_RING_DEQUEUE(qs) \ drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) int cxgb_debug = 0; static void sge_timer_cb(void *arg); static void sge_timer_reclaim(void *arg, int ncount); static void sge_txq_reclaim_handler(void *arg, int ncount); static void cxgb_start_locked(struct sge_qset *qs); /* * XXX need to cope with bursty scheduling by looking at a wider * window than we are now for determining the need for coalescing * */ static __inline uint64_t check_pkt_coalesce(struct sge_qset *qs) { struct adapter *sc; struct sge_txq *txq; uint8_t *fill; if (__predict_false(cxgb_tx_coalesce_force)) return (1); txq = &qs->txq[TXQ_ETH]; sc = qs->port->adapter; fill = &sc->tunq_fill[qs->idx]; if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX) cxgb_tx_coalesce_enable_start = COALESCE_START_MAX; if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN) cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN; /* * if the hardware transmit queue is more than 1/8 full * we mark it as coalescing - we drop back from coalescing * when we go below 1/32 full and there are no packets enqueued, * this provides us with some degree of hysteresis */ if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) && TXQ_RING_EMPTY(qs) && (qs->coalescing == 0)) *fill = 0; else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start)) *fill = 1; return (sc->tunq_coalesce); } #ifdef __LP64__ static void set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo) { uint64_t wr_hilo; #if _BYTE_ORDER == _LITTLE_ENDIAN wr_hilo = wr_hi; wr_hilo |= (((uint64_t)wr_lo)<<32); #else wr_hilo = wr_lo; wr_hilo |= (((uint64_t)wr_hi)<<32); #endif wrp->wrh_hilo = wr_hilo; } #else static void set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo) { wrp->wrh_hi = wr_hi; wmb(); wrp->wrh_lo = wr_lo; } #endif struct coalesce_info { int count; int nbytes; }; static int coalesce_check(struct mbuf *m, void *arg) { struct coalesce_info *ci = arg; int *count = &ci->count; int *nbytes = &ci->nbytes; if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) && (*count < 7) && (m->m_next == NULL))) { *count += 1; *nbytes += m->m_len; return (1); } return (0); } static struct mbuf * cxgb_dequeue(struct sge_qset *qs) { struct mbuf *m, *m_head, *m_tail; struct coalesce_info ci; if (check_pkt_coalesce(qs) == 0) return TXQ_RING_DEQUEUE(qs); m_head = m_tail = NULL; ci.count = ci.nbytes = 0; do { m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci); if (m_head == NULL) { m_tail = m_head = m; } else if (m != NULL) { m_tail->m_nextpkt = m; m_tail = m; } } while (m != NULL); if (ci.count > 7) panic("trying to coalesce %d packets in to one WR", ci.count); return (m_head); } /** * reclaim_completed_tx - reclaims completed Tx descriptors * @adapter: the adapter * @q: the Tx queue to reclaim completed descriptors from * * Reclaims Tx descriptors that the SGE has indicated it has processed, * and frees the associated buffers if possible. Called with the Tx * queue's lock held. */ static __inline int reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue) { struct sge_txq *q = &qs->txq[queue]; int reclaim = desc_reclaimable(q); if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) || (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN)) cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT; if (reclaim < reclaim_min) return (0); mtx_assert(&qs->lock, MA_OWNED); if (reclaim > 0) { t3_free_tx_desc(qs, reclaim, queue); q->cleaned += reclaim; q->in_use -= reclaim; } if (isset(&qs->txq_stopped, TXQ_ETH)) clrbit(&qs->txq_stopped, TXQ_ETH); return (reclaim); } /** * should_restart_tx - are there enough resources to restart a Tx queue? * @q: the Tx queue * * Checks if there are enough descriptors to restart a suspended Tx queue. */ static __inline int should_restart_tx(const struct sge_txq *q) { unsigned int r = q->processed - q->cleaned; return q->in_use - r < (q->size >> 1); } /** * t3_sge_init - initialize SGE * @adap: the adapter * @p: the SGE parameters * * Performs SGE initialization needed every time after a chip reset. * We do not initialize any of the queue sets here, instead the driver * top-level must request those individually. We also do not enable DMA * here, that should be done after the queues have been set up. */ void t3_sge_init(adapter_t *adap, struct sge_params *p) { u_int ctrl, ups; ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */ ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL | F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN | V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS | V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING; #if SGE_NUM_GENBITS == 1 ctrl |= F_EGRGENCTRL; #endif if (adap->params.rev > 0) { if (!(adap->flags & (USING_MSIX | USING_MSI))) ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ; } t3_write_reg(adap, A_SG_CONTROL, ctrl); t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) | V_LORCQDRBTHRSH(512)); t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10); t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) | V_TIMEOUT(200 * core_ticks_per_usec(adap))); t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, adap->params.rev < T3_REV_C ? 1000 : 500); t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256); t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000); t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256); t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff)); t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024); } /** * sgl_len - calculates the size of an SGL of the given capacity * @n: the number of SGL entries * * Calculates the number of flits needed for a scatter/gather list that * can hold the given number of entries. */ static __inline unsigned int sgl_len(unsigned int n) { return ((3 * n) / 2 + (n & 1)); } /** * get_imm_packet - return the next ingress packet buffer from a response * @resp: the response descriptor containing the packet data * * Return a packet containing the immediate data of the given response. */ static int get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m) { if (resp->rss_hdr.opcode == CPL_RX_DATA) { const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0]; m->m_len = sizeof(*cpl) + ntohs(cpl->len); } else if (resp->rss_hdr.opcode == CPL_RX_PKT) { const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0]; m->m_len = sizeof(*cpl) + ntohs(cpl->len); } else m->m_len = IMMED_PKT_SIZE; m->m_ext.ext_buf = NULL; m->m_ext.ext_type = 0; memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len); return (0); } static __inline u_int flits_to_desc(u_int n) { return (flit_desc_map[n]); } #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \ F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \ V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \ F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \ F_HIRCQPARITYERROR) #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR) #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \ F_RSPQDISABLED) /** * t3_sge_err_intr_handler - SGE async event interrupt handler * @adapter: the adapter * * Interrupt handler for SGE asynchronous (non-data) events. */ void t3_sge_err_intr_handler(adapter_t *adapter) { unsigned int v, status; status = t3_read_reg(adapter, A_SG_INT_CAUSE); if (status & SGE_PARERR) CH_ALERT(adapter, "SGE parity error (0x%x)\n", status & SGE_PARERR); if (status & SGE_FRAMINGERR) CH_ALERT(adapter, "SGE framing error (0x%x)\n", status & SGE_FRAMINGERR); if (status & F_RSPQCREDITOVERFOW) CH_ALERT(adapter, "SGE response queue credit overflow\n"); if (status & F_RSPQDISABLED) { v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS); CH_ALERT(adapter, "packet delivered to disabled response queue (0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff); } t3_write_reg(adapter, A_SG_INT_CAUSE, status); if (status & SGE_FATALERR) t3_fatal_err(adapter); } void t3_sge_prep(adapter_t *adap, struct sge_params *p) { int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size; nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus); nqsets *= adap->params.nports; fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE); while (!powerof2(fl_q_size)) fl_q_size--; use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters : is_offload(adap); #if __FreeBSD_version >= 700111 if (use_16k) { jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM16BYTES; } else { jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM9BYTES; } #else jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUMPAGESIZE; #endif while (!powerof2(jumbo_q_size)) jumbo_q_size--; if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2)) device_printf(adap->dev, "Insufficient clusters and/or jumbo buffers.\n"); p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data); for (i = 0; i < SGE_QSETS; ++i) { struct qset_params *q = p->qset + i; if (adap->params.nports > 2) { q->coalesce_usecs = 50; } else { #ifdef INVARIANTS q->coalesce_usecs = 10; #else q->coalesce_usecs = 5; #endif } q->polling = 0; q->rspq_size = RSPQ_Q_SIZE; q->fl_size = fl_q_size; q->jumbo_size = jumbo_q_size; q->jumbo_buf_size = jumbo_buf_size; q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE; q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16; q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE; q->cong_thres = 0; } } int t3_sge_alloc(adapter_t *sc) { /* The parent tag. */ if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ BUS_SPACE_UNRESTRICTED, /* nsegments */ BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */ 0, /* flags */ NULL, NULL, /* lock, lockarg */ &sc->parent_dmat)) { device_printf(sc->dev, "Cannot allocate parent DMA tag\n"); return (ENOMEM); } /* * DMA tag for normal sized RX frames */ if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) { device_printf(sc->dev, "Cannot allocate RX DMA tag\n"); return (ENOMEM); } /* * DMA tag for jumbo sized RX frames. */ if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) { device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n"); return (ENOMEM); } /* * DMA tag for TX frames. */ if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS, TX_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->tx_dmat)) { device_printf(sc->dev, "Cannot allocate TX DMA tag\n"); return (ENOMEM); } return (0); } int t3_sge_free(struct adapter * sc) { if (sc->tx_dmat != NULL) bus_dma_tag_destroy(sc->tx_dmat); if (sc->rx_jumbo_dmat != NULL) bus_dma_tag_destroy(sc->rx_jumbo_dmat); if (sc->rx_dmat != NULL) bus_dma_tag_destroy(sc->rx_dmat); if (sc->parent_dmat != NULL) bus_dma_tag_destroy(sc->parent_dmat); return (0); } void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) { qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U); qs->rspq.polling = 0 /* p->polling */; } #if !defined(__i386__) && !defined(__amd64__) static void refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct refill_fl_cb_arg *cb_arg = arg; cb_arg->error = error; cb_arg->seg = segs[0]; cb_arg->nseg = nseg; } #endif /** * refill_fl - refill an SGE free-buffer list * @sc: the controller softc * @q: the free-list to refill * @n: the number of new buffers to allocate * * (Re)populate an SGE free-buffer list with up to @n new packet buffers. * The caller must assure that @n does not exceed the queue's capacity. */ static void refill_fl(adapter_t *sc, struct sge_fl *q, int n) { struct rx_sw_desc *sd = &q->sdesc[q->pidx]; struct rx_desc *d = &q->desc[q->pidx]; struct refill_fl_cb_arg cb_arg; struct mbuf *m; caddr_t cl; int err; cb_arg.error = 0; while (n--) { /* * We allocate an uninitialized mbuf + cluster, mbuf is * initialized after rx. */ if (q->zone == zone_pack) { if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL) break; cl = m->m_ext.ext_buf; } else { if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL) break; if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { uma_zfree(q->zone, cl); break; } } if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) { if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) { log(LOG_WARNING, "bus_dmamap_create failed %d\n", err); uma_zfree(q->zone, cl); goto done; } sd->flags |= RX_SW_DESC_MAP_CREATED; } #if !defined(__i386__) && !defined(__amd64__) err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size, refill_fl_cb, &cb_arg, 0); if (err != 0 || cb_arg.error) { if (q->zone == zone_pack) uma_zfree(q->zone, cl); m_free(m); goto done; } #else cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl); #endif sd->flags |= RX_SW_DESC_INUSE; sd->rxsd_cl = cl; sd->m = m; d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff); d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff); d->len_gen = htobe32(V_FLD_GEN1(q->gen)); d->gen2 = htobe32(V_FLD_GEN2(q->gen)); d++; sd++; if (++q->pidx == q->size) { q->pidx = 0; q->gen ^= 1; sd = q->sdesc; d = q->desc; } q->credits++; q->db_pending++; } done: if (q->db_pending >= 32) { q->db_pending = 0; t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); } } /** * free_rx_bufs - free the Rx buffers on an SGE free list * @sc: the controle softc * @q: the SGE free list to clean up * * Release the buffers on an SGE free-buffer Rx queue. HW fetching from * this queue should be stopped before calling this function. */ static void free_rx_bufs(adapter_t *sc, struct sge_fl *q) { u_int cidx = q->cidx; while (q->credits--) { struct rx_sw_desc *d = &q->sdesc[cidx]; if (d->flags & RX_SW_DESC_INUSE) { bus_dmamap_unload(q->entry_tag, d->map); bus_dmamap_destroy(q->entry_tag, d->map); if (q->zone == zone_pack) { m_init(d->m, zone_pack, MCLBYTES, M_NOWAIT, MT_DATA, M_EXT); uma_zfree(zone_pack, d->m); } else { m_init(d->m, zone_mbuf, MLEN, M_NOWAIT, MT_DATA, 0); uma_zfree(zone_mbuf, d->m); uma_zfree(q->zone, d->rxsd_cl); } } d->rxsd_cl = NULL; d->m = NULL; if (++cidx == q->size) cidx = 0; } } static __inline void __refill_fl(adapter_t *adap, struct sge_fl *fl) { refill_fl(adap, fl, min(16U, fl->size - fl->credits)); } static __inline void __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max) { uint32_t reclaimable = fl->size - fl->credits; if (reclaimable > 0) refill_fl(adap, fl, min(max, reclaimable)); } /** * recycle_rx_buf - recycle a receive buffer * @adapter: the adapter * @q: the SGE free list * @idx: index of buffer to recycle * * Recycles the specified buffer on the given free list by adding it at * the next available slot on the list. */ static void recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx) { struct rx_desc *from = &q->desc[idx]; struct rx_desc *to = &q->desc[q->pidx]; q->sdesc[q->pidx] = q->sdesc[idx]; to->addr_lo = from->addr_lo; // already big endian to->addr_hi = from->addr_hi; // likewise wmb(); /* necessary ? */ to->len_gen = htobe32(V_FLD_GEN1(q->gen)); to->gen2 = htobe32(V_FLD_GEN2(q->gen)); q->credits++; if (++q->pidx == q->size) { q->pidx = 0; q->gen ^= 1; } t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); } static void alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { uint32_t *addr; addr = arg; *addr = segs[0].ds_addr; } static int alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size, bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag) { size_t len = nelem * elem_size; void *s = NULL; void *p = NULL; int err; if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0, BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor tag\n"); return (ENOMEM); } if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT, map)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor memory\n"); return (ENOMEM); } bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0); bzero(p, len); *(void **)desc = p; if (sw_size) { len = nelem * sw_size; s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); *(void **)sdesc = s; } if (parent_entry_tag == NULL) return (0); if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS, TX_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, entry_tag)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor entry tag\n"); return (ENOMEM); } return (0); } static void sge_slow_intr_handler(void *arg, int ncount) { adapter_t *sc = arg; t3_slow_intr_handler(sc); t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask); (void) t3_read_reg(sc, A_PL_INT_ENABLE0); } /** * sge_timer_cb - perform periodic maintenance of an SGE qset * @data: the SGE queue set to maintain * * Runs periodically from a timer to perform maintenance of an SGE queue * set. It performs two tasks: * * a) Cleans up any completed Tx descriptors that may still be pending. * Normal descriptor cleanup happens when new packets are added to a Tx * queue so this timer is relatively infrequent and does any cleanup only * if the Tx queue has not seen any new packets in a while. We make a * best effort attempt to reclaim descriptors, in that we don't wait * around if we cannot get a queue's lock (which most likely is because * someone else is queueing new packets and so will also handle the clean * up). Since control queues use immediate data exclusively we don't * bother cleaning them up here. * * b) Replenishes Rx queues that have run out due to memory shortage. * Normally new Rx buffers are added when existing ones are consumed but * when out of memory a queue can become empty. We try to add only a few * buffers here, the queue will be replenished fully as these new buffers * are used up if memory shortage has subsided. * * c) Return coalesced response queue credits in case a response queue is * starved. * * d) Ring doorbells for T304 tunnel queues since we have seen doorbell * fifo overflows and the FW doesn't implement any recovery scheme yet. */ static void sge_timer_cb(void *arg) { adapter_t *sc = arg; if ((sc->flags & USING_MSIX) == 0) { struct port_info *pi; struct sge_qset *qs; struct sge_txq *txq; int i, j; int reclaim_ofl, refill_rx; if (sc->open_device_map == 0) return; for (i = 0; i < sc->params.nports; i++) { pi = &sc->port[i]; for (j = 0; j < pi->nqsets; j++) { qs = &sc->sge.qs[pi->first_qset + j]; txq = &qs->txq[0]; reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned; refill_rx = ((qs->fl[0].credits < qs->fl[0].size) || (qs->fl[1].credits < qs->fl[1].size)); if (reclaim_ofl || refill_rx) { taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task); break; } } } } if (sc->params.nports > 2) { int i; for_each_port(sc, i) { struct port_info *pi = &sc->port[i]; t3_write_reg(sc, A_SG_KDOORBELL, F_SELEGRCNTX | (FW_TUNNEL_SGEEC_START + pi->first_qset)); } } if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) && sc->open_device_map != 0) callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); } /* * This is meant to be a catch-all function to keep sge state private * to sge.c * */ int t3_sge_init_adapter(adapter_t *sc) { callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE); callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc); return (0); } int t3_sge_reset_adapter(adapter_t *sc) { callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); return (0); } int t3_sge_init_port(struct port_info *pi) { TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi); return (0); } /** * refill_rspq - replenish an SGE response queue * @adapter: the adapter * @q: the response queue to replenish * @credits: how many new responses to make available * * Replenishes a response queue by making the supplied number of responses * available to HW. */ static __inline void refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits) { /* mbufs are allocated on demand when a rspq entry is processed. */ t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN, V_RSPQ(q->cntxt_id) | V_CREDITS(credits)); } static void sge_txq_reclaim_handler(void *arg, int ncount) { struct sge_qset *qs = arg; int i; for (i = 0; i < 3; i++) reclaim_completed_tx(qs, 16, i); } static void sge_timer_reclaim(void *arg, int ncount) { struct port_info *pi = arg; int i, nqsets = pi->nqsets; adapter_t *sc = pi->adapter; struct sge_qset *qs; struct mtx *lock; KASSERT((sc->flags & USING_MSIX) == 0, ("can't call timer reclaim for msi-x")); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[pi->first_qset + i]; reclaim_completed_tx(qs, 16, TXQ_OFLD); lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; if (mtx_trylock(lock)) { /* XXX currently assume that we are *NOT* polling */ uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS); if (qs->fl[0].credits < qs->fl[0].size - 16) __refill_fl(sc, &qs->fl[0]); if (qs->fl[1].credits < qs->fl[1].size - 16) __refill_fl(sc, &qs->fl[1]); if (status & (1 << qs->rspq.cntxt_id)) { if (qs->rspq.credits) { refill_rspq(sc, &qs->rspq, 1); qs->rspq.credits--; t3_write_reg(sc, A_SG_RSPQ_FL_STATUS, 1 << qs->rspq.cntxt_id); } } mtx_unlock(lock); } } } /** * init_qset_cntxt - initialize an SGE queue set context info * @qs: the queue set * @id: the queue set id * * Initializes the TIDs and context ids for the queues of a queue set. */ static void init_qset_cntxt(struct sge_qset *qs, u_int id) { qs->rspq.cntxt_id = id; qs->fl[0].cntxt_id = 2 * id; qs->fl[1].cntxt_id = 2 * id + 1; qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id; qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id; qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id; qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id; qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id; mbufq_init(&qs->txq[TXQ_ETH].sendq); mbufq_init(&qs->txq[TXQ_OFLD].sendq); mbufq_init(&qs->txq[TXQ_CTRL].sendq); } static void txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs) { txq->in_use += ndesc; /* * XXX we don't handle stopping of queue * presumably start handles this when we bump against the end */ txqs->gen = txq->gen; txq->unacked += ndesc; txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5); txq->unacked &= 31; txqs->pidx = txq->pidx; txq->pidx += ndesc; #ifdef INVARIANTS if (((txqs->pidx > txq->cidx) && (txq->pidx < txqs->pidx) && (txq->pidx >= txq->cidx)) || ((txqs->pidx < txq->cidx) && (txq->pidx >= txq-> cidx)) || ((txqs->pidx < txq->cidx) && (txq->cidx < txqs->pidx))) panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d", txqs->pidx, txq->pidx, txq->cidx); #endif if (txq->pidx >= txq->size) { txq->pidx -= txq->size; txq->gen ^= 1; } } /** * calc_tx_descs - calculate the number of Tx descriptors for a packet * @m: the packet mbufs * @nsegs: the number of segments * * Returns the number of Tx descriptors needed for the given Ethernet * packet. Ethernet packets require addition of WR and CPL headers. */ static __inline unsigned int calc_tx_descs(const struct mbuf *m, int nsegs) { unsigned int flits; if (m->m_pkthdr.len <= PIO_LEN) return 1; flits = sgl_len(nsegs) + 2; if (m->m_pkthdr.csum_flags & CSUM_TSO) flits++; return flits_to_desc(flits); } /** * make_sgl - populate a scatter/gather list for a packet * @sgp: the SGL to populate * @segs: the packet dma segments * @nsegs: the number of segments * * Generates a scatter/gather list for the buffers that make up a packet * and returns the SGL size in 8-byte words. The caller must size the SGL * appropriately. */ static __inline void make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs) { int i, idx; for (idx = 0, i = 0; i < nsegs; i++) { /* * firmware doesn't like empty segments */ if (segs[i].ds_len == 0) continue; if (i && idx == 0) ++sgp; sgp->len[idx] = htobe32(segs[i].ds_len); sgp->addr[idx] = htobe64(segs[i].ds_addr); idx ^= 1; } if (idx) { sgp->len[idx] = 0; sgp->addr[idx] = 0; } } /** * check_ring_tx_db - check and potentially ring a Tx queue's doorbell * @adap: the adapter * @q: the Tx queue * * Ring the doorbell if a Tx queue is asleep. There is a natural race, * where the HW is going to sleep just after we checked, however, * then the interrupt handler will detect the outstanding TX packet * and ring the doorbell for us. * * When GTS is disabled we unconditionally ring the doorbell. */ static __inline void check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring) { #if USE_GTS clear_bit(TXQ_LAST_PKT_DB, &q->flags); if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) { set_bit(TXQ_LAST_PKT_DB, &q->flags); #ifdef T3_TRACE T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d", q->cntxt_id); #endif t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } #else if (mustring || ++q->db_pending >= 32) { wmb(); /* write descriptors before telling HW */ t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); q->db_pending = 0; } #endif } static __inline void wr_gen2(struct tx_desc *d, unsigned int gen) { #if SGE_NUM_GENBITS == 2 d->flit[TX_DESC_FLITS - 1] = htobe64(gen); #endif } /** * write_wr_hdr_sgl - write a WR header and, optionally, SGL * @ndesc: number of Tx descriptors spanned by the SGL * @txd: first Tx descriptor to be written * @txqs: txq state (generation and producer index) * @txq: the SGE Tx queue * @sgl: the SGL * @flits: number of flits to the start of the SGL in the first descriptor * @sgl_flits: the SGL size in flits * @wr_hi: top 32 bits of WR header based on WR type (big endian) * @wr_lo: low 32 bits of WR header based on WR type (big endian) * * Write a work request header and an associated SGL. If the SGL is * small enough to fit into one Tx descriptor it has already been written * and we just need to write the WR header. Otherwise we distribute the * SGL across the number of descriptors it spans. */ static void write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs, const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits, unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo) { struct work_request_hdr *wrp = (struct work_request_hdr *)txd; struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx]; if (__predict_true(ndesc == 1)) { set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | wr_hi, htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) | wr_lo); wr_gen2(txd, txqs->gen); } else { unsigned int ogen = txqs->gen; const uint64_t *fp = (const uint64_t *)sgl; struct work_request_hdr *wp = wrp; wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | wr_hi; while (sgl_flits) { unsigned int avail = WR_FLITS - flits; if (avail > sgl_flits) avail = sgl_flits; memcpy(&txd->flit[flits], fp, avail * sizeof(*fp)); sgl_flits -= avail; ndesc--; if (!sgl_flits) break; fp += avail; txd++; txsd++; if (++txqs->pidx == txq->size) { txqs->pidx = 0; txqs->gen ^= 1; txd = txq->desc; txsd = txq->sdesc; } /* * when the head of the mbuf chain * is freed all clusters will be freed * with it */ wrp = (struct work_request_hdr *)txd; wrp->wrh_hi = htonl(V_WR_DATATYPE(1) | V_WR_SGLSFLT(1)) | wr_hi; wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS, sgl_flits + 1)) | V_WR_GEN(txqs->gen)) | wr_lo; wr_gen2(txd, txqs->gen); flits = 1; } wrp->wrh_hi |= htonl(F_WR_EOP); wmb(); wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo; wr_gen2((struct tx_desc *)wp, ogen); } } /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */ #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20) #define GET_VTAG(cntrl, m) \ do { \ if ((m)->m_flags & M_VLANTAG) \ cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \ } while (0) static int t3_encap(struct sge_qset *qs, struct mbuf **m) { adapter_t *sc; struct mbuf *m0; struct sge_txq *txq; struct txq_state txqs; struct port_info *pi; unsigned int ndesc, flits, cntrl, mlen; int err, nsegs, tso_info = 0; struct work_request_hdr *wrp; struct tx_sw_desc *txsd; struct sg_ent *sgp, *sgl; uint32_t wr_hi, wr_lo, sgl_flits; bus_dma_segment_t segs[TX_MAX_SEGS]; struct tx_desc *txd; pi = qs->port; sc = pi->adapter; txq = &qs->txq[TXQ_ETH]; txd = &txq->desc[txq->pidx]; txsd = &txq->sdesc[txq->pidx]; sgl = txq->txq_sgl; prefetch(txd); m0 = *m; mtx_assert(&qs->lock, MA_OWNED); cntrl = V_TXPKT_INTF(pi->txpkt_intf); KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n")); if (m0->m_nextpkt == NULL && m0->m_next != NULL && m0->m_pkthdr.csum_flags & (CSUM_TSO)) tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz); if (m0->m_nextpkt != NULL) { busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs); ndesc = 1; mlen = 0; } else { if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map, &m0, segs, &nsegs))) { if (cxgb_debug) printf("failed ... err=%d\n", err); return (err); } mlen = m0->m_pkthdr.len; ndesc = calc_tx_descs(m0, nsegs); } txq_prod(txq, ndesc, &txqs); KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs)); txsd->m = m0; if (m0->m_nextpkt != NULL) { struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd; int i, fidx; if (nsegs > 7) panic("trying to coalesce %d packets in to one WR", nsegs); txq->txq_coalesced += nsegs; wrp = (struct work_request_hdr *)txd; flits = nsegs*2 + 1; for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) { struct cpl_tx_pkt_batch_entry *cbe; uint64_t flit; uint32_t *hflit = (uint32_t *)&flit; int cflags = m0->m_pkthdr.csum_flags; cntrl = V_TXPKT_INTF(pi->txpkt_intf); GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); if (__predict_false(!(cflags & CSUM_IP))) cntrl |= F_TXPKT_IPCSUM_DIS; if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)))) cntrl |= F_TXPKT_L4CSUM_DIS; hflit[0] = htonl(cntrl); hflit[1] = htonl(segs[i].ds_len | 0x80000000); flit |= htobe64(1 << 24); cbe = &cpl_batch->pkt_entry[i]; cbe->cntrl = hflit[0]; cbe->len = hflit[1]; cbe->addr = htobe64(segs[i].ds_addr); } wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token)); set_wr_hdr(wrp, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); return (0); } else if (tso_info) { uint16_t eth_type; struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd; struct ether_header *eh; void *l3hdr; struct tcphdr *tcp; txd->flit[2] = 0; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); hdr->cntrl = htonl(cntrl); hdr->len = htonl(mlen | 0x80000000); if (__predict_false(mlen < TCPPKTHDRSIZE)) { printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x", m0, mlen, m0->m_pkthdr.tso_segsz, (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags); panic("tx tso packet too small"); } /* Make sure that ether, ip, tcp headers are all in m0 */ if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) { m0 = m_pullup(m0, TCPPKTHDRSIZE); if (__predict_false(m0 == NULL)) { /* XXX panic probably an overreaction */ panic("couldn't fit header into mbuf"); } } eh = mtod(m0, struct ether_header *); eth_type = eh->ether_type; if (eth_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *evh = (void *)eh; tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN); l3hdr = evh + 1; eth_type = evh->evl_proto; } else { tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II); l3hdr = eh + 1; } if (eth_type == htons(ETHERTYPE_IP)) { struct ip *ip = l3hdr; tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl); tcp = (struct tcphdr *)(ip + 1); } else if (eth_type == htons(ETHERTYPE_IPV6)) { struct ip6_hdr *ip6 = l3hdr; KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO with ip6_nxt %d", __func__, ip6->ip6_nxt)); tso_info |= F_LSO_IPV6; tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2); tcp = (struct tcphdr *)(ip6 + 1); } else panic("%s: CSUM_TSO but neither ip nor ip6", __func__); tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off); hdr->lso_info = htonl(tso_info); if (__predict_false(mlen <= PIO_LEN)) { /* * pkt not undersized but fits in PIO_LEN * Indicates a TSO bug at the higher levels. */ txsd->m = NULL; m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]); flits = (mlen + 7) / 8 + 3; wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | F_WR_SOP | F_WR_EOP | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); set_wr_hdr(&hdr->wr, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); m_freem(m0); return (0); } flits = 3; } else { struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP))) cntrl |= F_TXPKT_IPCSUM_DIS; if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)))) cntrl |= F_TXPKT_L4CSUM_DIS; cpl->cntrl = htonl(cntrl); cpl->len = htonl(mlen | 0x80000000); if (mlen <= PIO_LEN) { txsd->m = NULL; m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]); flits = (mlen + 7) / 8 + 2; wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | F_WR_SOP | F_WR_EOP | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); set_wr_hdr(&cpl->wr, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); m_freem(m0); return (0); } flits = 2; } wrp = (struct work_request_hdr *)txd; sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl; make_sgl(sgp, segs, nsegs); sgl_flits = sgl_len(nsegs); ETHER_BPF_MTAP(pi->ifp, m0); KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc)); wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_TID(txq->token)); write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo); check_ring_tx_db(sc, txq, 0); return (0); } void cxgb_tx_watchdog(void *arg) { struct sge_qset *qs = arg; struct sge_txq *txq = &qs->txq[TXQ_ETH]; if (qs->coalescing != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) && TXQ_RING_EMPTY(qs)) qs->coalescing = 0; else if (qs->coalescing == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start)) qs->coalescing = 1; if (TXQ_TRYLOCK(qs)) { qs->qs_flags |= QS_FLUSHING; cxgb_start_locked(qs); qs->qs_flags &= ~QS_FLUSHING; TXQ_UNLOCK(qs); } if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog, qs, txq->txq_watchdog.c_cpu); } static void cxgb_tx_timeout(void *arg) { struct sge_qset *qs = arg; struct sge_txq *txq = &qs->txq[TXQ_ETH]; if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3))) qs->coalescing = 1; if (TXQ_TRYLOCK(qs)) { qs->qs_flags |= QS_TIMEOUT; cxgb_start_locked(qs); qs->qs_flags &= ~QS_TIMEOUT; TXQ_UNLOCK(qs); } } static void cxgb_start_locked(struct sge_qset *qs) { struct mbuf *m_head = NULL; struct sge_txq *txq = &qs->txq[TXQ_ETH]; struct port_info *pi = qs->port; struct ifnet *ifp = pi->ifp; if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT)) reclaim_completed_tx(qs, 0, TXQ_ETH); if (!pi->link_config.link_ok) { TXQ_RING_FLUSH(qs); return; } TXQ_LOCK_ASSERT(qs); while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) && pi->link_config.link_ok) { reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH); if (txq->size - txq->in_use <= TX_MAX_DESC) break; if ((m_head = cxgb_dequeue(qs)) == NULL) break; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. */ if (t3_encap(qs, &m_head) || m_head == NULL) break; m_head = NULL; } if (txq->db_pending) check_ring_tx_db(pi->adapter, txq, 1); if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 && pi->link_config.link_ok) callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout, qs, txq->txq_timer.c_cpu); if (m_head != NULL) m_freem(m_head); } static int cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m) { struct port_info *pi = qs->port; struct sge_txq *txq = &qs->txq[TXQ_ETH]; struct buf_ring *br = txq->txq_mr; int error, avail; avail = txq->size - txq->in_use; TXQ_LOCK_ASSERT(qs); /* * We can only do a direct transmit if the following are true: * - we aren't coalescing (ring < 3/4 full) * - the link is up -- checked in caller * - there are no packets enqueued already * - there is space in hardware transmit queue */ if (check_pkt_coalesce(qs) == 0 && !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) { if (t3_encap(qs, &m)) { if (m != NULL && (error = drbr_enqueue(ifp, br, m)) != 0) return (error); } else { if (txq->db_pending) check_ring_tx_db(pi->adapter, txq, 1); /* * We've bypassed the buf ring so we need to update * the stats directly */ txq->txq_direct_packets++; txq->txq_direct_bytes += m->m_pkthdr.len; } } else if ((error = drbr_enqueue(ifp, br, m)) != 0) return (error); reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH); if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok && (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7))) cxgb_start_locked(qs); else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer)) callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout, qs, txq->txq_timer.c_cpu); return (0); } int cxgb_transmit(struct ifnet *ifp, struct mbuf *m) { struct sge_qset *qs; struct port_info *pi = ifp->if_softc; int error, qidx = pi->first_qset; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||(!pi->link_config.link_ok)) { m_freem(m); return (0); } - - if (m->m_flags & M_FLOWID) + + /* check if flowid is set */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset; qs = &pi->adapter->sge.qs[qidx]; if (TXQ_TRYLOCK(qs)) { /* XXX running */ error = cxgb_transmit_locked(ifp, qs, m); TXQ_UNLOCK(qs); } else error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m); return (error); } void cxgb_qflush(struct ifnet *ifp) { /* * flush any enqueued mbufs in the buf_rings * and in the transmit queues * no-op for now */ return; } /** * write_imm - write a packet into a Tx descriptor as immediate data * @d: the Tx descriptor to write * @m: the packet * @len: the length of packet data to write as immediate data * @gen: the generation bit value to write * * Writes a packet as immediate data into a Tx descriptor. The packet * contains a work request at its beginning. We must write the packet * carefully so the SGE doesn't read accidentally before it's written in * its entirety. */ static __inline void write_imm(struct tx_desc *d, caddr_t src, unsigned int len, unsigned int gen) { struct work_request_hdr *from = (struct work_request_hdr *)src; struct work_request_hdr *to = (struct work_request_hdr *)d; uint32_t wr_hi, wr_lo; KASSERT(len <= WR_LEN && len >= sizeof(*from), ("%s: invalid len %d", __func__, len)); memcpy(&to[1], &from[1], len - sizeof(*from)); wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP | V_WR_BCNTLFLT(len & 7)); wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8)); set_wr_hdr(to, wr_hi, wr_lo); wmb(); wr_gen2(d, gen); } /** * check_desc_avail - check descriptor availability on a send queue * @adap: the adapter * @q: the TX queue * @m: the packet needing the descriptors * @ndesc: the number of Tx descriptors needed * @qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL) * * Checks if the requested number of Tx descriptors is available on an * SGE send queue. If the queue is already suspended or not enough * descriptors are available the packet is queued for later transmission. * Must be called with the Tx queue locked. * * Returns 0 if enough descriptors are available, 1 if there aren't * enough descriptors and the packet has been queued, and 2 if the caller * needs to retry because there weren't enough descriptors at the * beginning of the call but some freed up in the mean time. */ static __inline int check_desc_avail(adapter_t *adap, struct sge_txq *q, struct mbuf *m, unsigned int ndesc, unsigned int qid) { /* * XXX We currently only use this for checking the control queue * the control queue is only used for binding qsets which happens * at init time so we are guaranteed enough descriptors */ if (__predict_false(!mbufq_empty(&q->sendq))) { addq_exit: mbufq_tail(&q->sendq, m); return 1; } if (__predict_false(q->size - q->in_use < ndesc)) { struct sge_qset *qs = txq_to_qset(q, qid); setbit(&qs->txq_stopped, qid); if (should_restart_tx(q) && test_and_clear_bit(qid, &qs->txq_stopped)) return 2; q->stops++; goto addq_exit; } return 0; } /** * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs * @q: the SGE control Tx queue * * This is a variant of reclaim_completed_tx() that is used for Tx queues * that send only immediate data (presently just the control queues) and * thus do not have any mbufs */ static __inline void reclaim_completed_tx_imm(struct sge_txq *q) { unsigned int reclaim = q->processed - q->cleaned; q->in_use -= reclaim; q->cleaned += reclaim; } /** * ctrl_xmit - send a packet through an SGE control Tx queue * @adap: the adapter * @q: the control queue * @m: the packet * * Send a packet through an SGE control Tx queue. Packets sent through * a control queue must fit entirely as immediate data in a single Tx * descriptor and have no page fragments. */ static int ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { int ret; struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *); struct sge_txq *q = &qs->txq[TXQ_CTRL]; KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__)); wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP); wrp->wrh_lo = htonl(V_WR_TID(q->token)); TXQ_LOCK(qs); again: reclaim_completed_tx_imm(q); ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL); if (__predict_false(ret)) { if (ret == 1) { TXQ_UNLOCK(qs); return (ENOSPC); } goto again; } write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); q->in_use++; if (++q->pidx >= q->size) { q->pidx = 0; q->gen ^= 1; } TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); m_free(m); return (0); } /** * restart_ctrlq - restart a suspended control queue * @qs: the queue set cotaining the control queue * * Resumes transmission on a suspended Tx control queue. */ static void restart_ctrlq(void *data, int npending) { struct mbuf *m; struct sge_qset *qs = (struct sge_qset *)data; struct sge_txq *q = &qs->txq[TXQ_CTRL]; adapter_t *adap = qs->port->adapter; TXQ_LOCK(qs); again: reclaim_completed_tx_imm(q); while (q->in_use < q->size && (m = mbufq_dequeue(&q->sendq)) != NULL) { write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); m_free(m); if (++q->pidx >= q->size) { q->pidx = 0; q->gen ^= 1; } q->in_use++; } if (!mbufq_empty(&q->sendq)) { setbit(&qs->txq_stopped, TXQ_CTRL); if (should_restart_tx(q) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) goto again; q->stops++; } TXQ_UNLOCK(qs); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } /* * Send a management message through control queue 0 */ int t3_mgmt_tx(struct adapter *adap, struct mbuf *m) { return ctrl_xmit(adap, &adap->sge.qs[0], m); } /** * free_qset - free the resources of an SGE queue set * @sc: the controller owning the queue set * @q: the queue set * * Release the HW and SW resources associated with an SGE queue set, such * as HW contexts, packet buffers, and descriptor rings. Traffic to the * queue set must be quiesced prior to calling this. */ static void t3_free_qset(adapter_t *sc, struct sge_qset *q) { int i; reclaim_completed_tx(q, 0, TXQ_ETH); if (q->txq[TXQ_ETH].txq_mr != NULL) buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF); if (q->txq[TXQ_ETH].txq_ifq != NULL) { ifq_delete(q->txq[TXQ_ETH].txq_ifq); free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF); } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { if (q->fl[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_disable_fl(sc, q->fl[i].cntxt_id); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map); bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc, q->fl[i].desc_map); bus_dma_tag_destroy(q->fl[i].desc_tag); bus_dma_tag_destroy(q->fl[i].entry_tag); } if (q->fl[i].sdesc) { free_rx_bufs(sc, &q->fl[i]); free(q->fl[i].sdesc, M_DEVBUF); } } mtx_unlock(&q->lock); MTX_DESTROY(&q->lock); for (i = 0; i < SGE_TXQ_PER_SET; i++) { if (q->txq[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->txq[i].desc_tag, q->txq[i].desc_map); bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc, q->txq[i].desc_map); bus_dma_tag_destroy(q->txq[i].desc_tag); bus_dma_tag_destroy(q->txq[i].entry_tag); } if (q->txq[i].sdesc) { free(q->txq[i].sdesc, M_DEVBUF); } } if (q->rspq.desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map); bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc, q->rspq.desc_map); bus_dma_tag_destroy(q->rspq.desc_tag); MTX_DESTROY(&q->rspq.lock); } #if defined(INET6) || defined(INET) tcp_lro_free(&q->lro.ctrl); #endif bzero(q, sizeof(*q)); } /** * t3_free_sge_resources - free SGE resources * @sc: the adapter softc * * Frees resources used by the SGE queue sets. */ void t3_free_sge_resources(adapter_t *sc, int nqsets) { int i; for (i = 0; i < nqsets; ++i) { TXQ_LOCK(&sc->sge.qs[i]); t3_free_qset(sc, &sc->sge.qs[i]); } } /** * t3_sge_start - enable SGE * @sc: the controller softc * * Enables the SGE for DMAs. This is the last step in starting packet * transfers. */ void t3_sge_start(adapter_t *sc) { t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE); } /** * t3_sge_stop - disable SGE operation * @sc: the adapter * * Disables the DMA engine. This can be called in emeregencies (e.g., * from error interrupts) or from normal process context. In the latter * case it also disables any pending queue restart tasklets. Note that * if it is called in interrupt context it cannot disable the restart * tasklets as it cannot wait, however the tasklets will have no effect * since the doorbells are disabled and the driver will call this again * later from process context, at which time the tasklets will be stopped * if they are still running. */ void t3_sge_stop(adapter_t *sc) { int i, nqsets; t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0); if (sc->tq == NULL) return; for (nqsets = i = 0; i < (sc)->params.nports; i++) nqsets += sc->port[i].nqsets; #ifdef notyet /* * * XXX */ for (i = 0; i < nqsets; ++i) { struct sge_qset *qs = &sc->sge.qs[i]; taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } #endif } /** * t3_free_tx_desc - reclaims Tx descriptors and their buffers * @adapter: the adapter * @q: the Tx queue to reclaim descriptors from * @reclaimable: the number of descriptors to reclaim * @m_vec_size: maximum number of buffers to reclaim * @desc_reclaimed: returns the number of descriptors reclaimed * * Reclaims Tx descriptors from an SGE Tx queue and frees the associated * Tx buffers. Called with the Tx queue lock held. * * Returns number of buffers of reclaimed */ void t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue) { struct tx_sw_desc *txsd; unsigned int cidx, mask; struct sge_txq *q = &qs->txq[queue]; #ifdef T3_TRACE T3_TRACE2(sc->tb[q->cntxt_id & 7], "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx); #endif cidx = q->cidx; mask = q->size - 1; txsd = &q->sdesc[cidx]; mtx_assert(&qs->lock, MA_OWNED); while (reclaimable--) { prefetch(q->sdesc[(cidx + 1) & mask].m); prefetch(q->sdesc[(cidx + 2) & mask].m); if (txsd->m != NULL) { if (txsd->flags & TX_SW_DESC_MAPPED) { bus_dmamap_unload(q->entry_tag, txsd->map); txsd->flags &= ~TX_SW_DESC_MAPPED; } m_freem_list(txsd->m); txsd->m = NULL; } else q->txq_skipped++; ++txsd; if (++cidx == q->size) { cidx = 0; txsd = q->sdesc; } } q->cidx = cidx; } /** * is_new_response - check if a response is newly written * @r: the response descriptor * @q: the response queue * * Returns true if a response descriptor contains a yet unprocessed * response. */ static __inline int is_new_response(const struct rsp_desc *r, const struct sge_rspq *q) { return (r->intr_gen & F_RSPD_GEN2) == q->gen; } #define RSPD_GTS_MASK (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS) #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \ V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \ V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \ V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR)) /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */ #define NOMEM_INTR_DELAY 2500 #ifdef TCP_OFFLOAD /** * write_ofld_wr - write an offload work request * @adap: the adapter * @m: the packet to send * @q: the Tx queue * @pidx: index of the first Tx descriptor to write * @gen: the generation value to use * @ndesc: number of descriptors the packet will occupy * * Write an offload work request to send the supplied packet. The packet * data already carry the work request with most fields populated. */ static void write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q, unsigned int pidx, unsigned int gen, unsigned int ndesc) { unsigned int sgl_flits, flits; int i, idx, nsegs, wrlen; struct work_request_hdr *from; struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1]; struct tx_desc *d = &q->desc[pidx]; struct txq_state txqs; struct sglist_seg *segs; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); struct sglist *sgl; from = (void *)(oh + 1); /* Start of WR within mbuf */ wrlen = m->m_len - sizeof(*oh); if (!(oh->flags & F_HDR_SGL)) { write_imm(d, (caddr_t)from, wrlen, gen); /* * mbuf with "real" immediate tx data will be enqueue_wr'd by * t3_push_frames and freed in wr_ack. Others, like those sent * down by close_conn, t3_send_reset, etc. should be freed here. */ if (!(oh->flags & F_HDR_DF)) m_free(m); return; } memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from)); sgl = oh->sgl; flits = wrlen / 8; sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl; nsegs = sgl->sg_nseg; segs = sgl->sg_segs; for (idx = 0, i = 0; i < nsegs; i++) { KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__)); if (i && idx == 0) ++sgp; sgp->len[idx] = htobe32(segs[i].ss_len); sgp->addr[idx] = htobe64(segs[i].ss_paddr); idx ^= 1; } if (idx) { sgp->len[idx] = 0; sgp->addr[idx] = 0; } sgl_flits = sgl_len(nsegs); txqs.gen = gen; txqs.pidx = pidx; txqs.compl = 0; write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits, from->wrh_hi, from->wrh_lo); } /** * ofld_xmit - send a packet through an offload queue * @adap: the adapter * @q: the Tx offload queue * @m: the packet * * Send an offload packet through an SGE offload queue. */ static int ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { int ret; unsigned int ndesc; unsigned int pidx, gen; struct sge_txq *q = &qs->txq[TXQ_OFLD]; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); ndesc = G_HDR_NDESC(oh->flags); TXQ_LOCK(qs); again: reclaim_completed_tx(qs, 16, TXQ_OFLD); ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD); if (__predict_false(ret)) { if (ret == 1) { TXQ_UNLOCK(qs); return (EINTR); } goto again; } gen = q->gen; q->in_use += ndesc; pidx = q->pidx; q->pidx += ndesc; if (q->pidx >= q->size) { q->pidx -= q->size; q->gen ^= 1; } write_ofld_wr(adap, m, q, pidx, gen, ndesc); check_ring_tx_db(adap, q, 1); TXQ_UNLOCK(qs); return (0); } /** * restart_offloadq - restart a suspended offload queue * @qs: the queue set cotaining the offload queue * * Resumes transmission on a suspended Tx offload queue. */ static void restart_offloadq(void *data, int npending) { struct mbuf *m; struct sge_qset *qs = data; struct sge_txq *q = &qs->txq[TXQ_OFLD]; adapter_t *adap = qs->port->adapter; int cleaned; TXQ_LOCK(qs); again: cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD); while ((m = mbufq_peek(&q->sendq)) != NULL) { unsigned int gen, pidx; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); unsigned int ndesc = G_HDR_NDESC(oh->flags); if (__predict_false(q->size - q->in_use < ndesc)) { setbit(&qs->txq_stopped, TXQ_OFLD); if (should_restart_tx(q) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) goto again; q->stops++; break; } gen = q->gen; q->in_use += ndesc; pidx = q->pidx; q->pidx += ndesc; if (q->pidx >= q->size) { q->pidx -= q->size; q->gen ^= 1; } (void)mbufq_dequeue(&q->sendq); TXQ_UNLOCK(qs); write_ofld_wr(adap, m, q, pidx, gen, ndesc); TXQ_LOCK(qs); } #if USE_GTS set_bit(TXQ_RUNNING, &q->flags); set_bit(TXQ_LAST_PKT_DB, &q->flags); #endif TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } /** * t3_offload_tx - send an offload packet * @m: the packet * * Sends an offload packet. We use the packet priority to select the * appropriate Tx queue as follows: bit 0 indicates whether the packet * should be sent as regular or control, bits 1-3 select the queue set. */ int t3_offload_tx(struct adapter *sc, struct mbuf *m) { struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)]; if (oh->flags & F_HDR_CTRL) { m_adj(m, sizeof (*oh)); /* trim ofld_hdr off */ return (ctrl_xmit(sc, qs, m)); } else return (ofld_xmit(sc, qs, m)); } #endif static void restart_tx(struct sge_qset *qs) { struct adapter *sc = qs->port->adapter; if (isset(&qs->txq_stopped, TXQ_OFLD) && should_restart_tx(&qs->txq[TXQ_OFLD]) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) { qs->txq[TXQ_OFLD].restarts++; taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); } if (isset(&qs->txq_stopped, TXQ_CTRL) && should_restart_tx(&qs->txq[TXQ_CTRL]) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) { qs->txq[TXQ_CTRL].restarts++; taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } } /** * t3_sge_alloc_qset - initialize an SGE queue set * @sc: the controller softc * @id: the queue set id * @nports: how many Ethernet ports will be using this queue set * @irq_vec_idx: the IRQ vector index for response queue interrupts * @p: configuration parameters for this queue set * @ntxq: number of Tx queues for the queue set * @pi: port info for queue set * * Allocate resources and initialize an SGE queue set. A queue set * comprises a response queue, two Rx free-buffer queues, and up to 3 * Tx queues. The Tx queues are assigned roles in the order Ethernet * queue, offload queue, and control queue. */ int t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, const struct qset_params *p, int ntxq, struct port_info *pi) { struct sge_qset *q = &sc->sge.qs[id]; int i, ret = 0; MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF); q->port = pi; q->adap = sc; if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size, M_DEVBUF, M_WAITOK, &q->lock)) == NULL) { device_printf(sc->dev, "failed to allocate mbuf ring\n"); goto err; } if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT | M_ZERO)) == NULL) { device_printf(sc->dev, "failed to allocate ifq\n"); goto err; } ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp); callout_init(&q->txq[TXQ_ETH].txq_timer, 1); callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1); q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus; q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus; init_qset_cntxt(q, id); q->idx = id; if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc), sizeof(struct rx_sw_desc), &q->fl[0].phys_addr, &q->fl[0].desc, &q->fl[0].sdesc, &q->fl[0].desc_tag, &q->fl[0].desc_map, sc->rx_dmat, &q->fl[0].entry_tag)) != 0) { printf("error %d from alloc ring fl0\n", ret); goto err; } if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc), sizeof(struct rx_sw_desc), &q->fl[1].phys_addr, &q->fl[1].desc, &q->fl[1].sdesc, &q->fl[1].desc_tag, &q->fl[1].desc_map, sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) { printf("error %d from alloc ring fl1\n", ret); goto err; } if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0, &q->rspq.phys_addr, &q->rspq.desc, NULL, &q->rspq.desc_tag, &q->rspq.desc_map, NULL, NULL)) != 0) { printf("error %d from alloc ring rspq\n", ret); goto err; } snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d", device_get_unit(sc->dev), irq_vec_idx); MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF); for (i = 0; i < ntxq; ++i) { size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc); if ((ret = alloc_ring(sc, p->txq_size[i], sizeof(struct tx_desc), sz, &q->txq[i].phys_addr, &q->txq[i].desc, &q->txq[i].sdesc, &q->txq[i].desc_tag, &q->txq[i].desc_map, sc->tx_dmat, &q->txq[i].entry_tag)) != 0) { printf("error %d from alloc ring tx %i\n", ret, i); goto err; } mbufq_init(&q->txq[i].sendq); q->txq[i].gen = 1; q->txq[i].size = p->txq_size[i]; } #ifdef TCP_OFFLOAD TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q); #endif TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q); TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q); TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q); q->fl[0].gen = q->fl[1].gen = 1; q->fl[0].size = p->fl_size; q->fl[1].size = p->jumbo_size; q->rspq.gen = 1; q->rspq.cidx = 0; q->rspq.size = p->rspq_size; q->txq[TXQ_ETH].stop_thres = nports * flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3); q->fl[0].buf_size = MCLBYTES; q->fl[0].zone = zone_pack; q->fl[0].type = EXT_PACKET; if (p->jumbo_buf_size == MJUM16BYTES) { q->fl[1].zone = zone_jumbo16; q->fl[1].type = EXT_JUMBO16; } else if (p->jumbo_buf_size == MJUM9BYTES) { q->fl[1].zone = zone_jumbo9; q->fl[1].type = EXT_JUMBO9; } else if (p->jumbo_buf_size == MJUMPAGESIZE) { q->fl[1].zone = zone_jumbop; q->fl[1].type = EXT_JUMBOP; } else { KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size)); ret = EDOOFUS; goto err; } q->fl[1].buf_size = p->jumbo_buf_size; /* Allocate and setup the lro_ctrl structure */ q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO); #if defined(INET6) || defined(INET) ret = tcp_lro_init(&q->lro.ctrl); if (ret) { printf("error %d from tcp_lro_init\n", ret); goto err; } #endif q->lro.ctrl.ifp = pi->ifp; mtx_lock_spin(&sc->sge.reg_lock); ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx, q->rspq.phys_addr, q->rspq.size, q->fl[0].buf_size, 1, 0); if (ret) { printf("error %d from t3_sge_init_rspcntxt\n", ret); goto err_unlock; } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0, q->fl[i].phys_addr, q->fl[i].size, q->fl[i].buf_size, p->cong_thres, 1, 0); if (ret) { printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i); goto err_unlock; } } ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS, SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr, q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } if (ntxq > 1) { ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id, USE_GTS, SGE_CNTXT_OFLD, id, q->txq[TXQ_OFLD].phys_addr, q->txq[TXQ_OFLD].size, 0, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } } if (ntxq > 2) { ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0, SGE_CNTXT_CTRL, id, q->txq[TXQ_CTRL].phys_addr, q->txq[TXQ_CTRL].size, q->txq[TXQ_CTRL].token, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } } mtx_unlock_spin(&sc->sge.reg_lock); t3_update_qset_coalesce(q, p); refill_fl(sc, &q->fl[0], q->fl[0].size); refill_fl(sc, &q->fl[1], q->fl[1].size); refill_rspq(sc, &q->rspq, q->rspq.size - 1); t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) | V_NEWTIMER(q->rspq.holdoff_tmr)); return (0); err_unlock: mtx_unlock_spin(&sc->sge.reg_lock); err: TXQ_LOCK(q); t3_free_qset(sc, q); return (ret); } /* * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with * ethernet data. Hardware assistance with various checksums and any vlan tag * will also be taken into account here. */ void t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad) { struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad); struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]]; struct ifnet *ifp = pi->ifp; if (cpl->vlan_valid) { m->m_pkthdr.ether_vtag = ntohs(cpl->vlan); m->m_flags |= M_VLANTAG; } m->m_pkthdr.rcvif = ifp; /* * adjust after conversion to mbuf chain */ m->m_pkthdr.len -= (sizeof(*cpl) + ethpad); m->m_len -= (sizeof(*cpl) + ethpad); m->m_data += (sizeof(*cpl) + ethpad); if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) { struct ether_header *eh = mtod(m, void *); uint16_t eh_type; if (eh->ether_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *evh = mtod(m, void *); eh_type = evh->evl_proto; } else eh_type = eh->ether_type; if (ifp->if_capenable & IFCAP_RXCSUM && eh_type == htons(ETHERTYPE_IP)) { m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && eh_type == htons(ETHERTYPE_IPV6)) { m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } } /** * get_packet - return the next ingress packet buffer from a free list * @adap: the adapter that received the packet * @drop_thres: # of remaining buffers before we start dropping packets * @qs: the qset that the SGE free list holding the packet belongs to * @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain * @r: response descriptor * * Get the next packet from a free list and complete setup of the * sk_buff. If the packet is small we make a copy and recycle the * original buffer, otherwise we use the original buffer itself. If a * positive drop threshold is supplied packets are dropped and their * buffers recycled if (a) the number of remaining buffers is under the * threshold and the packet is too big to copy, or (b) the packet should * be copied but there is no memory for the copy. */ static int get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs, struct t3_mbuf_hdr *mh, struct rsp_desc *r) { unsigned int len_cq = ntohl(r->len_cq); struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0]; int mask, cidx = fl->cidx; struct rx_sw_desc *sd = &fl->sdesc[cidx]; uint32_t len = G_RSPD_LEN(len_cq); uint32_t flags = M_EXT; uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags)); caddr_t cl; struct mbuf *m; int ret = 0; mask = fl->size - 1; prefetch(fl->sdesc[(cidx + 1) & mask].m); prefetch(fl->sdesc[(cidx + 2) & mask].m); prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl); prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl); fl->credits--; bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD); if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) { if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) goto skip_recycle; cl = mtod(m, void *); memcpy(cl, sd->rxsd_cl, len); recycle_rx_buf(adap, fl, fl->cidx); m->m_pkthdr.len = m->m_len = len; m->m_flags = 0; mh->mh_head = mh->mh_tail = m; ret = 1; goto done; } else { skip_recycle: bus_dmamap_unload(fl->entry_tag, sd->map); cl = sd->rxsd_cl; m = sd->m; if ((sopeop == RSPQ_SOP_EOP) || (sopeop == RSPQ_SOP)) flags |= M_PKTHDR; m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags); if (fl->zone == zone_pack) { /* * restore clobbered data pointer */ m->m_data = m->m_ext.ext_buf; } else { m_cljset(m, cl, fl->type); } m->m_len = len; } switch(sopeop) { case RSPQ_SOP_EOP: ret = 1; /* FALLTHROUGH */ case RSPQ_SOP: mh->mh_head = mh->mh_tail = m; m->m_pkthdr.len = len; break; case RSPQ_EOP: ret = 1; /* FALLTHROUGH */ case RSPQ_NSOP_NEOP: if (mh->mh_tail == NULL) { log(LOG_ERR, "discarding intermediate descriptor entry\n"); m_freem(m); break; } mh->mh_tail->m_next = m; mh->mh_tail = m; mh->mh_head->m_pkthdr.len += len; break; } if (cxgb_debug) printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len); done: if (++fl->cidx == fl->size) fl->cidx = 0; return (ret); } /** * handle_rsp_cntrl_info - handles control information in a response * @qs: the queue set corresponding to the response * @flags: the response control flags * * Handles the control information of an SGE response, such as GTS * indications and completion credits for the queue set's Tx queues. * HW coalesces credits, we don't do any extra SW coalescing. */ static __inline void handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags) { unsigned int credits; #if USE_GTS if (flags & F_RSPD_TXQ0_GTS) clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags); #endif credits = G_RSPD_TXQ0_CR(flags); if (credits) qs->txq[TXQ_ETH].processed += credits; credits = G_RSPD_TXQ2_CR(flags); if (credits) qs->txq[TXQ_CTRL].processed += credits; # if USE_GTS if (flags & F_RSPD_TXQ1_GTS) clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags); # endif credits = G_RSPD_TXQ1_CR(flags); if (credits) qs->txq[TXQ_OFLD].processed += credits; } static void check_ring_db(adapter_t *adap, struct sge_qset *qs, unsigned int sleeping) { ; } /** * process_responses - process responses from an SGE response queue * @adap: the adapter * @qs: the queue set to which the response queue belongs * @budget: how many responses can be processed in this round * * Process responses from an SGE response queue up to the supplied budget. * Responses include received packets as well as credits and other events * for the queues that belong to the response queue's queue set. * A negative budget is effectively unlimited. * * Additionally choose the interrupt holdoff time for the next interrupt * on this queue. If the system is under memory shortage use a fairly * long delay to help recovery. */ static int process_responses(adapter_t *adap, struct sge_qset *qs, int budget) { struct sge_rspq *rspq = &qs->rspq; struct rsp_desc *r = &rspq->desc[rspq->cidx]; int budget_left = budget; unsigned int sleeping = 0; #if defined(INET6) || defined(INET) int lro_enabled = qs->lro.enabled; int skip_lro; struct lro_ctrl *lro_ctrl = &qs->lro.ctrl; #endif struct t3_mbuf_hdr *mh = &rspq->rspq_mh; #ifdef DEBUG static int last_holdoff = 0; if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) { printf("next_holdoff=%d\n", rspq->holdoff_tmr); last_holdoff = rspq->holdoff_tmr; } #endif rspq->next_holdoff = rspq->holdoff_tmr; while (__predict_true(budget_left && is_new_response(r, rspq))) { int eth, eop = 0, ethpad = 0; uint32_t flags = ntohl(r->flags); uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val); uint8_t opcode = r->rss_hdr.opcode; eth = (opcode == CPL_RX_PKT); if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) { struct mbuf *m; if (cxgb_debug) printf("async notification\n"); if (mh->mh_head == NULL) { mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA); m = mh->mh_head; } else { m = m_gethdr(M_NOWAIT, MT_DATA); } if (m == NULL) goto no_mem; memcpy(mtod(m, char *), r, AN_PKT_SIZE); m->m_len = m->m_pkthdr.len = AN_PKT_SIZE; *mtod(m, char *) = CPL_ASYNC_NOTIF; opcode = CPL_ASYNC_NOTIF; eop = 1; rspq->async_notif++; goto skip; } else if (flags & F_RSPD_IMM_DATA_VALID) { struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { no_mem: rspq->next_holdoff = NOMEM_INTR_DELAY; budget_left--; break; } if (mh->mh_head == NULL) mh->mh_head = m; else mh->mh_tail->m_next = m; mh->mh_tail = m; get_imm_packet(adap, r, m); mh->mh_head->m_pkthdr.len += m->m_len; eop = 1; rspq->imm_data++; } else if (r->len_cq) { int drop_thresh = eth ? SGE_RX_DROP_THRES : 0; eop = get_packet(adap, drop_thresh, qs, mh, r); if (eop) { - if (r->rss_hdr.hash_type && !adap->timestamp) - mh->mh_head->m_flags |= M_FLOWID; - mh->mh_head->m_pkthdr.flowid = rss_hash; + if (r->rss_hdr.hash_type && !adap->timestamp) { + M_HASHTYPE_SET(mh->mh_head, M_HASHTYPE_OPAQUE); + mh->mh_head->m_pkthdr.flowid = rss_hash; + } } ethpad = 2; } else { rspq->pure_rsps++; } skip: if (flags & RSPD_CTRL_MASK) { sleeping |= flags & RSPD_GTS_MASK; handle_rsp_cntrl_info(qs, flags); } if (!eth && eop) { rspq->offload_pkts++; #ifdef TCP_OFFLOAD adap->cpl_handler[opcode](qs, r, mh->mh_head); #else m_freem(mh->mh_head); #endif mh->mh_head = NULL; } else if (eth && eop) { struct mbuf *m = mh->mh_head; t3_rx_eth(adap, m, ethpad); /* * The T304 sends incoming packets on any qset. If LRO * is also enabled, we could end up sending packet up * lro_ctrl->ifp's input. That is incorrect. * * The mbuf's rcvif was derived from the cpl header and * is accurate. Skip LRO and just use that. */ #if defined(INET6) || defined(INET) skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif); if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro && (tcp_lro_rx(lro_ctrl, m, 0) == 0) ) { /* successfully queue'd for LRO */ } else #endif { /* * LRO not enabled, packet unsuitable for LRO, * or unable to queue. Pass it up right now in * either case. */ struct ifnet *ifp = m->m_pkthdr.rcvif; (*ifp->if_input)(ifp, m); } mh->mh_head = NULL; } r++; if (__predict_false(++rspq->cidx == rspq->size)) { rspq->cidx = 0; rspq->gen ^= 1; r = rspq->desc; } if (++rspq->credits >= 64) { refill_rspq(adap, rspq, rspq->credits); rspq->credits = 0; } __refill_fl_lt(adap, &qs->fl[0], 32); __refill_fl_lt(adap, &qs->fl[1], 32); --budget_left; } #if defined(INET6) || defined(INET) /* Flush LRO */ while (!SLIST_EMPTY(&lro_ctrl->lro_active)) { struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active); SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next); tcp_lro_flush(lro_ctrl, queued); } #endif if (sleeping) check_ring_db(adap, qs, sleeping); mb(); /* commit Tx queue processed updates */ if (__predict_false(qs->txq_stopped > 1)) restart_tx(qs); __refill_fl_lt(adap, &qs->fl[0], 512); __refill_fl_lt(adap, &qs->fl[1], 512); budget -= budget_left; return (budget); } /* * A helper function that processes responses and issues GTS. */ static __inline int process_responses_gts(adapter_t *adap, struct sge_rspq *rq) { int work; static int last_holdoff = 0; work = process_responses(adap, rspq_to_qset(rq), -1); if (cxgb_debug && (rq->next_holdoff != last_holdoff)) { printf("next_holdoff=%d\n", rq->next_holdoff); last_holdoff = rq->next_holdoff; } t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) | V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx)); return (work); } /* * Interrupt handler for legacy INTx interrupts for T3B-based cards. * Handles data events from SGE response queues as well as error and other * async events as they all use the same interrupt pin. We use one SGE * response queue per port in this mode and protect all response queues with * queue 0's lock. */ void t3b_intr(void *data) { uint32_t i, map; adapter_t *adap = data; struct sge_rspq *q0 = &adap->sge.qs[0].rspq; t3_write_reg(adap, A_PL_CLI, 0); map = t3_read_reg(adap, A_SG_DATA_INTR); if (!map) return; if (__predict_false(map & F_ERRINTR)) { t3_write_reg(adap, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(adap, A_PL_INT_ENABLE0); taskqueue_enqueue(adap->tq, &adap->slow_intr_task); } mtx_lock(&q0->lock); for_each_port(adap, i) if (map & (1 << i)) process_responses_gts(adap, &adap->sge.qs[i].rspq); mtx_unlock(&q0->lock); } /* * The MSI interrupt handler. This needs to handle data events from SGE * response queues as well as error and other async events as they all use * the same MSI vector. We use one SGE response queue per port in this mode * and protect all response queues with queue 0's lock. */ void t3_intr_msi(void *data) { adapter_t *adap = data; struct sge_rspq *q0 = &adap->sge.qs[0].rspq; int i, new_packets = 0; mtx_lock(&q0->lock); for_each_port(adap, i) if (process_responses_gts(adap, &adap->sge.qs[i].rspq)) new_packets = 1; mtx_unlock(&q0->lock); if (new_packets == 0) { t3_write_reg(adap, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(adap, A_PL_INT_ENABLE0); taskqueue_enqueue(adap->tq, &adap->slow_intr_task); } } void t3_intr_msix(void *data) { struct sge_qset *qs = data; adapter_t *adap = qs->port->adapter; struct sge_rspq *rspq = &qs->rspq; if (process_responses_gts(adap, rspq) == 0) rspq->unhandled_irqs++; } #define QDUMP_SBUF_SIZE 32 * 400 static int t3_dump_rspq(SYSCTL_HANDLER_ARGS) { struct sge_rspq *rspq; struct sge_qset *qs; int i, err, dump_end, idx; struct sbuf *sb; struct rsp_desc *rspd; uint32_t data[4]; rspq = arg1; qs = rspq_to_qset(rspq); if (rspq->rspq_dump_count == 0) return (0); if (rspq->rspq_dump_count > RSPQ_Q_SIZE) { log(LOG_WARNING, "dump count is too large %d\n", rspq->rspq_dump_count); rspq->rspq_dump_count = 0; return (EINVAL); } if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", rspq->rspq_dump_start); rspq->rspq_dump_start = 0; return (EINVAL); } err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data); if (err) return (err); err = sysctl_wire_old_buffer(req, 0); if (err) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n", (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f), ((data[2] >> 26) & 1), ((data[2] >> 27) & 1)); sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n", ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]); sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start, (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1)); dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count; for (i = rspq->rspq_dump_start; i < dump_end; i++) { idx = i & (RSPQ_Q_SIZE-1); rspd = &rspq->desc[idx]; sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n", idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx, rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx)); sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n", rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags), be32toh(rspd->len_cq), rspd->intr_gen); } err = sbuf_finish(sb); /* Output a trailing NUL. */ if (err == 0) err = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (err); } static int t3_dump_txq_eth(SYSCTL_HANDLER_ARGS) { struct sge_txq *txq; struct sge_qset *qs; int i, j, err, dump_end; struct sbuf *sb; struct tx_desc *txd; uint32_t *WR, wr_hi, wr_lo, gen; uint32_t data[4]; txq = arg1; qs = txq_to_qset(txq, TXQ_ETH); if (txq->txq_dump_count == 0) { return (0); } if (txq->txq_dump_count > TX_ETH_Q_SIZE) { log(LOG_WARNING, "dump count is too large %d\n", txq->txq_dump_count); txq->txq_dump_count = 1; return (EINVAL); } if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", txq->txq_dump_start); txq->txq_dump_start = 0; return (EINVAL); } err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data); if (err) return (err); err = sysctl_wire_old_buffer(req, 0); if (err) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n", (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16), (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1)); sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n", ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1), ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1)); sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx, txq->txq_dump_start, (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1)); dump_end = txq->txq_dump_start + txq->txq_dump_count; for (i = txq->txq_dump_start; i < dump_end; i++) { txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)]; WR = (uint32_t *)txd->flit; wr_hi = ntohl(WR[0]); wr_lo = ntohl(WR[1]); gen = G_WR_GEN(wr_lo); sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n", wr_hi, wr_lo, gen); for (j = 2; j < 30; j += 4) sbuf_printf(sb, "\t%08x %08x %08x %08x \n", WR[j], WR[j + 1], WR[j + 2], WR[j + 3]); } err = sbuf_finish(sb); /* Output a trailing NUL. */ if (err == 0) err = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (err); } static int t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS) { struct sge_txq *txq; struct sge_qset *qs; int i, j, err, dump_end; struct sbuf *sb; struct tx_desc *txd; uint32_t *WR, wr_hi, wr_lo, gen; txq = arg1; qs = txq_to_qset(txq, TXQ_CTRL); if (txq->txq_dump_count == 0) { return (0); } if (txq->txq_dump_count > 256) { log(LOG_WARNING, "dump count is too large %d\n", txq->txq_dump_count); txq->txq_dump_count = 1; return (EINVAL); } if (txq->txq_dump_start > 255) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", txq->txq_dump_start); txq->txq_dump_start = 0; return (EINVAL); } err = sysctl_wire_old_buffer(req, 0); if (err != 0) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx, txq->txq_dump_start, (txq->txq_dump_start + txq->txq_dump_count) & 255); dump_end = txq->txq_dump_start + txq->txq_dump_count; for (i = txq->txq_dump_start; i < dump_end; i++) { txd = &txq->desc[i & (255)]; WR = (uint32_t *)txd->flit; wr_hi = ntohl(WR[0]); wr_lo = ntohl(WR[1]); gen = G_WR_GEN(wr_lo); sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n", wr_hi, wr_lo, gen); for (j = 2; j < 30; j += 4) sbuf_printf(sb, "\t%08x %08x %08x %08x \n", WR[j], WR[j + 1], WR[j + 2], WR[j + 3]); } err = sbuf_finish(sb); /* Output a trailing NUL. */ if (err == 0) err = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (err); } static int t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; struct qset_params *qsp = &sc->params.sge.qset[0]; int coalesce_usecs; struct sge_qset *qs; int i, j, err, nqsets = 0; struct mtx *lock; if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); coalesce_usecs = qsp->coalesce_usecs; err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req); if (err != 0) { return (err); } if (coalesce_usecs == qsp->coalesce_usecs) return (0); for (i = 0; i < sc->params.nports; i++) for (j = 0; j < sc->port[i].nqsets; j++) nqsets++; coalesce_usecs = max(1, coalesce_usecs); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[i]; qsp = &sc->params.sge.qset[i]; qsp->coalesce_usecs = coalesce_usecs; lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; mtx_lock(lock); t3_update_qset_coalesce(qs, qsp); t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) | V_NEWTIMER(qs->rspq.holdoff_tmr)); mtx_unlock(lock); } return (0); } static int t3_pkt_timestamp(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; int rc, timestamp; if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); timestamp = sc->timestamp; rc = sysctl_handle_int(oidp, ×tamp, arg2, req); if (rc != 0) return (rc); if (timestamp != sc->timestamp) { t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS, timestamp ? F_ENABLERXPKTTMSTPRSS : 0); sc->timestamp = timestamp; } return (0); } void t3_add_attach_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); /* random information */ SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, &sc->params.rev, 0, "chip model"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "port_types", CTLFLAG_RD, sc->port_types, 0, "type of ports"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "enable_debug", CTLFLAG_RW, &cxgb_debug, 0, "enable verbose debugging output"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce", CTLFLAG_RD, &sc->tunq_coalesce, "#tunneled packets freed"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "txq_overrun", CTLFLAG_RD, &txq_fills, 0, "#times txq overrun"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, &sc->params.vpd.cclk, 0, "core clock frequency (in KHz)"); } static const char *rspq_name = "rspq"; static const char *txq_names[] = { "txq_eth", "txq_ofld", "txq_ctrl" }; static int sysctl_handle_macstat(SYSCTL_HANDLER_ARGS) { struct port_info *p = arg1; uint64_t *parg; if (!p) return (EINVAL); cxgb_refresh_stats(p); parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2); return (sysctl_handle_64(oidp, parg, 0, req)); } void t3_add_configured_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; int i, j; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal", CTLTYPE_INT|CTLFLAG_RW, sc, 0, t3_set_coalesce_usecs, "I", "interrupt coalescing timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pkt_timestamp", CTLTYPE_INT | CTLFLAG_RW, sc, 0, t3_pkt_timestamp, "I", "provide packet timestamp instead of connection hash"); for (i = 0; i < sc->params.nports; i++) { struct port_info *pi = &sc->port[i]; struct sysctl_oid *poid; struct sysctl_oid_list *poidlist; struct mac_stats *mstats = &pi->mac.stats; snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i); poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, pi->namebuf, CTLFLAG_RD, NULL, "port statistics"); poidlist = SYSCTL_CHILDREN(poid); SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO, "nqsets", CTLFLAG_RD, &pi->nqsets, 0, "#queue sets"); for (j = 0; j < pi->nqsets; j++) { struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j]; struct sysctl_oid *qspoid, *rspqpoid, *txqpoid, *ctrlqpoid, *lropoid; struct sysctl_oid_list *qspoidlist, *rspqpoidlist, *txqpoidlist, *ctrlqpoidlist, *lropoidlist; struct sge_txq *txq = &qs->txq[TXQ_ETH]; snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j); qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, qs->namebuf, CTLFLAG_RD, NULL, "qset statistics"); qspoidlist = SYSCTL_CHILDREN(qspoid); SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty", CTLFLAG_RD, &qs->fl[0].empty, 0, "freelist #0 empty"); SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty", CTLFLAG_RD, &qs->fl[1].empty, 0, "freelist #1 empty"); rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, rspq_name, CTLFLAG_RD, NULL, "rspq statistics"); rspqpoidlist = SYSCTL_CHILDREN(rspqpoid); txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, txq_names[0], CTLFLAG_RD, NULL, "txq statistics"); txqpoidlist = SYSCTL_CHILDREN(txqpoid); ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics"); ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid); lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, "lro_stats", CTLFLAG_RD, NULL, "LRO statistics"); lropoidlist = SYSCTL_CHILDREN(lropoid); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size", CTLFLAG_RD, &qs->rspq.size, 0, "#entries in response queue"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx", CTLFLAG_RD, &qs->rspq.cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits", CTLFLAG_RD, &qs->rspq.credits, 0, "#credits"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved", CTLFLAG_RD, &qs->rspq.starved, 0, "#times starved"); SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr", CTLFLAG_RD, &qs->rspq.phys_addr, "physical_address_of the queue"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->rspq.rspq_dump_start, 0, "start rspq dump entry"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->rspq.rspq_dump_count, 0, "#rspq entries to dump"); SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq, 0, t3_dump_rspq, "A", "dump of the response queue"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped", CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops, "#tunneled packets dropped"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen", CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen, 0, "#tunneled packets waiting to be sent"); #if 0 SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod, 0, "#tunneled packets queue producer index"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons, 0, "#tunneled packets queue consumer index"); #endif SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed", CTLFLAG_RD, &qs->txq[TXQ_ETH].processed, 0, "#tunneled packets processed by the card"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned", CTLFLAG_RD, &txq->cleaned, 0, "#tunneled packets cleaned"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use", CTLFLAG_RD, &txq->in_use, 0, "#tunneled packet slots in use"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees", CTLFLAG_RD, &txq->txq_frees, "#tunneled packets freed"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped", CTLFLAG_RD, &txq->txq_skipped, 0, "#tunneled packet descriptors skipped"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced", CTLFLAG_RD, &txq->txq_coalesced, "#tunneled packets coalesced"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued", CTLFLAG_RD, &txq->txq_enqueued, 0, "#tunneled packets enqueued to hardware"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags", CTLFLAG_RD, &qs->txq_stopped, 0, "tx queues stopped"); SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr", CTLFLAG_RD, &txq->phys_addr, "physical_address_of the queue"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen", CTLFLAG_RW, &qs->txq[TXQ_ETH].gen, 0, "txq generation"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx", CTLFLAG_RD, &txq->cidx, 0, "hardware queue cidx"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx", CTLFLAG_RD, &txq->pidx, 0, "hardware queue pidx"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start, 0, "txq start idx for dump"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count, 0, "txq #entries to dump"); SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH], 0, t3_dump_txq_eth, "A", "dump of the transmit queue"); SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start, 0, "ctrlq start idx for dump"); SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count, 0, "ctrl #entries to dump"); SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL], 0, t3_dump_txq_ctrl, "A", "dump of the transmit queue"); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued", CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed", CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum", CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt", CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL); } /* Now add a node for mac stats. */ poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC statistics"); poidlist = SYSCTL_CHILDREN(poid); /* * We (ab)use the length argument (arg2) to pass on the offset * of the data that we are interested in. This is only required * for the quad counters that are updated from the hardware (we * make sure that we return the latest value). * sysctl_handle_macstat first updates *all* the counters from * the hardware, and then returns the latest value of the * requested counter. Best would be to update only the * requested counter from hardware, but t3_mac_update_stats() * hides all the register details and we don't want to dive into * all that here. */ #define CXGB_SYSCTL_ADD_QUAD(a) SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \ (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \ sysctl_handle_macstat, "QU", 0) CXGB_SYSCTL_ADD_QUAD(tx_octets); CXGB_SYSCTL_ADD_QUAD(tx_octets_bad); CXGB_SYSCTL_ADD_QUAD(tx_frames); CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames); CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames); CXGB_SYSCTL_ADD_QUAD(tx_pause); CXGB_SYSCTL_ADD_QUAD(tx_deferred); CXGB_SYSCTL_ADD_QUAD(tx_late_collisions); CXGB_SYSCTL_ADD_QUAD(tx_total_collisions); CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions); CXGB_SYSCTL_ADD_QUAD(tx_underrun); CXGB_SYSCTL_ADD_QUAD(tx_len_errs); CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs); CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral); CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs); CXGB_SYSCTL_ADD_QUAD(tx_frames_64); CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127); CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255); CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511); CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023); CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518); CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max); CXGB_SYSCTL_ADD_QUAD(rx_octets); CXGB_SYSCTL_ADD_QUAD(rx_octets_bad); CXGB_SYSCTL_ADD_QUAD(rx_frames); CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames); CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames); CXGB_SYSCTL_ADD_QUAD(rx_pause); CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs); CXGB_SYSCTL_ADD_QUAD(rx_align_errs); CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs); CXGB_SYSCTL_ADD_QUAD(rx_data_errs); CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs); CXGB_SYSCTL_ADD_QUAD(rx_runt); CXGB_SYSCTL_ADD_QUAD(rx_jabber); CXGB_SYSCTL_ADD_QUAD(rx_short); CXGB_SYSCTL_ADD_QUAD(rx_too_long); CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs); CXGB_SYSCTL_ADD_QUAD(rx_cong_drops); CXGB_SYSCTL_ADD_QUAD(rx_frames_64); CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127); CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255); CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511); CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023); CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518); CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max); #undef CXGB_SYSCTL_ADD_QUAD #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \ CTLFLAG_RD, &mstats->a, 0) CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err); CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err); CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun); CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl); CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss); CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err); CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change); CXGB_SYSCTL_ADD_ULONG(num_toggled); CXGB_SYSCTL_ADD_ULONG(num_resets); CXGB_SYSCTL_ADD_ULONG(link_faults); #undef CXGB_SYSCTL_ADD_ULONG } } /** * t3_get_desc - dump an SGE descriptor for debugging purposes * @qs: the queue set * @qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx) * @idx: the descriptor index in the queue * @data: where to dump the descriptor contents * * Dumps the contents of a HW descriptor of an SGE queue. Returns the * size of the descriptor. */ int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx, unsigned char *data) { if (qnum >= 6) return (EINVAL); if (qnum < 3) { if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size) return -EINVAL; memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc)); return sizeof(struct tx_desc); } if (qnum == 3) { if (!qs->rspq.desc || idx >= qs->rspq.size) return (EINVAL); memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc)); return sizeof(struct rsp_desc); } qnum -= 4; if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size) return (EINVAL); memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc)); return sizeof(struct rx_desc); } Index: head/sys/dev/cxgbe/t4_main.c =================================================================== --- head/sys/dev/cxgbe/t4_main.c (revision 275357) +++ head/sys/dev/cxgbe/t4_main.c (revision 275358) @@ -1,8572 +1,8573 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "t4_ioctl.h" #include "t4_l2t.h" /* T4 bus driver interface */ static int t4_probe(device_t); static int t4_attach(device_t); static int t4_detach(device_t); static device_method_t t4_methods[] = { DEVMETHOD(device_probe, t4_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD_END }; static driver_t t4_driver = { "t4nex", t4_methods, sizeof(struct adapter) }; /* T4 port (cxgbe) interface */ static int cxgbe_probe(device_t); static int cxgbe_attach(device_t); static int cxgbe_detach(device_t); static device_method_t cxgbe_methods[] = { DEVMETHOD(device_probe, cxgbe_probe), DEVMETHOD(device_attach, cxgbe_attach), DEVMETHOD(device_detach, cxgbe_detach), { 0, 0 } }; static driver_t cxgbe_driver = { "cxgbe", cxgbe_methods, sizeof(struct port_info) }; static d_ioctl_t t4_ioctl; static d_open_t t4_open; static d_close_t t4_close; static struct cdevsw t4_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = t4_open, .d_close = t4_close, .d_ioctl = t4_ioctl, .d_name = "t4nex", }; /* T5 bus driver interface */ static int t5_probe(device_t); static device_method_t t5_methods[] = { DEVMETHOD(device_probe, t5_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD_END }; static driver_t t5_driver = { "t5nex", t5_methods, sizeof(struct adapter) }; /* T5 port (cxl) interface */ static driver_t cxl_driver = { "cxl", cxgbe_methods, sizeof(struct port_info) }; static struct cdevsw t5_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = t4_open, .d_close = t4_close, .d_ioctl = t4_ioctl, .d_name = "t5nex", }; /* ifnet + media interface */ static void cxgbe_init(void *); static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t); static int cxgbe_transmit(struct ifnet *, struct mbuf *); static void cxgbe_qflush(struct ifnet *); static uint64_t cxgbe_get_counter(struct ifnet *, ift_counter); static int cxgbe_media_change(struct ifnet *); static void cxgbe_media_status(struct ifnet *, struct ifmediareq *); MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services"); /* * Correct lock order when you need to acquire multiple locks is t4_list_lock, * then ADAPTER_LOCK, then t4_uld_list_lock. */ static struct sx t4_list_lock; SLIST_HEAD(, adapter) t4_list; #ifdef TCP_OFFLOAD static struct sx t4_uld_list_lock; SLIST_HEAD(, uld_info) t4_uld_list; #endif /* * Tunables. See tweak_tunables() too. * * Each tunable is set to a default value here if it's known at compile-time. * Otherwise it is set to -1 as an indication to tweak_tunables() that it should * provide a reasonable default when the driver is loaded. * * Tunables applicable to both T4 and T5 are under hw.cxgbe. Those specific to * T5 are under hw.cxl. */ /* * Number of queues for tx and rx, 10G and 1G, NIC and offload. */ #define NTXQ_10G 16 static int t4_ntxq10g = -1; TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq10g); #define NRXQ_10G 8 static int t4_nrxq10g = -1; TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq10g); #define NTXQ_1G 4 static int t4_ntxq1g = -1; TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g); #define NRXQ_1G 2 static int t4_nrxq1g = -1; TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g); static int t4_rsrv_noflowq = 0; TUNABLE_INT("hw.cxgbe.rsrv_noflowq", &t4_rsrv_noflowq); #ifdef TCP_OFFLOAD #define NOFLDTXQ_10G 8 static int t4_nofldtxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g); #define NOFLDRXQ_10G 2 static int t4_nofldrxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq10g", &t4_nofldrxq10g); #define NOFLDTXQ_1G 2 static int t4_nofldtxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq1g", &t4_nofldtxq1g); #define NOFLDRXQ_1G 1 static int t4_nofldrxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq1g", &t4_nofldrxq1g); #endif #ifdef DEV_NETMAP #define NNMTXQ_10G 2 static int t4_nnmtxq10g = -1; TUNABLE_INT("hw.cxgbe.nnmtxq10g", &t4_nnmtxq10g); #define NNMRXQ_10G 2 static int t4_nnmrxq10g = -1; TUNABLE_INT("hw.cxgbe.nnmrxq10g", &t4_nnmrxq10g); #define NNMTXQ_1G 1 static int t4_nnmtxq1g = -1; TUNABLE_INT("hw.cxgbe.nnmtxq1g", &t4_nnmtxq1g); #define NNMRXQ_1G 1 static int t4_nnmrxq1g = -1; TUNABLE_INT("hw.cxgbe.nnmrxq1g", &t4_nnmrxq1g); #endif /* * Holdoff parameters for 10G and 1G ports. */ #define TMR_IDX_10G 1 static int t4_tmr_idx_10g = TMR_IDX_10G; TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx_10g); #define PKTC_IDX_10G (-1) static int t4_pktc_idx_10g = PKTC_IDX_10G; TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx_10g); #define TMR_IDX_1G 1 static int t4_tmr_idx_1g = TMR_IDX_1G; TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_1G", &t4_tmr_idx_1g); #define PKTC_IDX_1G (-1) static int t4_pktc_idx_1g = PKTC_IDX_1G; TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_1G", &t4_pktc_idx_1g); /* * Size (# of entries) of each tx and rx queue. */ static unsigned int t4_qsize_txq = TX_EQ_QSIZE; TUNABLE_INT("hw.cxgbe.qsize_txq", &t4_qsize_txq); static unsigned int t4_qsize_rxq = RX_IQ_QSIZE; TUNABLE_INT("hw.cxgbe.qsize_rxq", &t4_qsize_rxq); /* * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively). */ static int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX; TUNABLE_INT("hw.cxgbe.interrupt_types", &t4_intr_types); /* * Configuration file. */ #define DEFAULT_CF "default" #define FLASH_CF "flash" #define UWIRE_CF "uwire" #define FPGA_CF "fpga" static char t4_cfg_file[32] = DEFAULT_CF; TUNABLE_STR("hw.cxgbe.config_file", t4_cfg_file, sizeof(t4_cfg_file)); /* * PAUSE settings (bit 0, 1 = rx_pause, tx_pause respectively). * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water * mark or when signalled to do so, 0 to never emit PAUSE. */ static int t4_pause_settings = PAUSE_TX | PAUSE_RX; TUNABLE_INT("hw.cxgbe.pause_settings", &t4_pause_settings); /* * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed, * encouraged respectively). */ static unsigned int t4_fw_install = 1; TUNABLE_INT("hw.cxgbe.fw_install", &t4_fw_install); /* * ASIC features that will be used. Disable the ones you don't want so that the * chip resources aren't wasted on features that will not be used. */ static int t4_linkcaps_allowed = 0; /* No DCBX, PPP, etc. by default */ TUNABLE_INT("hw.cxgbe.linkcaps_allowed", &t4_linkcaps_allowed); static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC; TUNABLE_INT("hw.cxgbe.niccaps_allowed", &t4_niccaps_allowed); static int t4_toecaps_allowed = -1; TUNABLE_INT("hw.cxgbe.toecaps_allowed", &t4_toecaps_allowed); static int t4_rdmacaps_allowed = 0; TUNABLE_INT("hw.cxgbe.rdmacaps_allowed", &t4_rdmacaps_allowed); static int t4_iscsicaps_allowed = 0; TUNABLE_INT("hw.cxgbe.iscsicaps_allowed", &t4_iscsicaps_allowed); static int t4_fcoecaps_allowed = 0; TUNABLE_INT("hw.cxgbe.fcoecaps_allowed", &t4_fcoecaps_allowed); static int t5_write_combine = 0; TUNABLE_INT("hw.cxl.write_combine", &t5_write_combine); struct intrs_and_queues { uint16_t intr_type; /* INTx, MSI, or MSI-X */ uint16_t nirq; /* Total # of vectors */ uint16_t intr_flags_10g;/* Interrupt flags for each 10G port */ uint16_t intr_flags_1g; /* Interrupt flags for each 1G port */ uint16_t ntxq10g; /* # of NIC txq's for each 10G port */ uint16_t nrxq10g; /* # of NIC rxq's for each 10G port */ uint16_t ntxq1g; /* # of NIC txq's for each 1G port */ uint16_t nrxq1g; /* # of NIC rxq's for each 1G port */ uint16_t rsrv_noflowq; /* Flag whether to reserve queue 0 */ #ifdef TCP_OFFLOAD uint16_t nofldtxq10g; /* # of TOE txq's for each 10G port */ uint16_t nofldrxq10g; /* # of TOE rxq's for each 10G port */ uint16_t nofldtxq1g; /* # of TOE txq's for each 1G port */ uint16_t nofldrxq1g; /* # of TOE rxq's for each 1G port */ #endif #ifdef DEV_NETMAP uint16_t nnmtxq10g; /* # of netmap txq's for each 10G port */ uint16_t nnmrxq10g; /* # of netmap rxq's for each 10G port */ uint16_t nnmtxq1g; /* # of netmap txq's for each 1G port */ uint16_t nnmrxq1g; /* # of netmap rxq's for each 1G port */ #endif }; struct filter_entry { uint32_t valid:1; /* filter allocated and valid */ uint32_t locked:1; /* filter is administratively locked */ uint32_t pending:1; /* filter action is pending firmware reply */ uint32_t smtidx:8; /* Source MAC Table index for smac */ struct l2t_entry *l2t; /* Layer Two Table entry for dmac */ struct t4_filter_specification fs; }; static int map_bars_0_and_4(struct adapter *); static int map_bar_2(struct adapter *); static void setup_memwin(struct adapter *); static int validate_mem_range(struct adapter *, uint32_t, int); static int fwmtype_to_hwmtype(int); static int validate_mt_off_len(struct adapter *, int, uint32_t, int, uint32_t *); static void memwin_info(struct adapter *, int, uint32_t *, uint32_t *); static uint32_t position_memwin(struct adapter *, int, uint32_t); static int cfg_itype_and_nqueues(struct adapter *, int, int, struct intrs_and_queues *); static int prep_firmware(struct adapter *); static int partition_resources(struct adapter *, const struct firmware *, const char *); static int get_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); static void build_medialist(struct port_info *, struct ifmedia *); static int cxgbe_init_synchronized(struct port_info *); static int cxgbe_uninit_synchronized(struct port_info *); static int setup_intr_handlers(struct adapter *); static void quiesce_eq(struct adapter *, struct sge_eq *); static void quiesce_iq(struct adapter *, struct sge_iq *); static void quiesce_fl(struct adapter *, struct sge_fl *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, driver_intr_t *, void *, char *); static int t4_free_irq(struct adapter *, struct irq *); static void reg_block_dump(struct adapter *, uint8_t *, unsigned int, unsigned int); static void t4_get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void cxgbe_refresh_stats(struct adapter *, struct port_info *); static void cxgbe_tick(void *); static void cxgbe_vlan_config(void *, struct ifnet *, uint16_t); static int cpl_not_handled(struct sge_iq *, const struct rss_header *, struct mbuf *); static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *); static int fw_msg_not_handled(struct adapter *, const __be64 *); static int t4_sysctls(struct adapter *); static int cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS); static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS); static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS); static int sysctl_temperature(SYSCTL_HANDLER_ARGS); #ifdef SBUF_DRAIN static int sysctl_cctrl(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS); static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_devlog(SYSCTL_HANDLER_ARGS); static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS); static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS); static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS); static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS); static int sysctl_meminfo(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS); static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS); static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS); static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tids(SYSCTL_HANDLER_ARGS); static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la(SYSCTL_HANDLER_ARGS); static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); #endif static inline void txq_start(struct ifnet *, struct sge_txq *); static uint32_t fconf_to_mode(uint32_t); static uint32_t mode_to_fconf(uint32_t); static uint32_t fspec_to_fconf(struct t4_filter_specification *); static int get_filter_mode(struct adapter *, uint32_t *); static int set_filter_mode(struct adapter *, uint32_t); static inline uint64_t get_filter_hits(struct adapter *, uint32_t); static int get_filter(struct adapter *, struct t4_filter *); static int set_filter(struct adapter *, struct t4_filter *); static int del_filter(struct adapter *, struct t4_filter *); static void clear_filter(struct filter_entry *); static int set_filter_wr(struct adapter *, int); static int del_filter_wr(struct adapter *, int); static int get_sge_context(struct adapter *, struct t4_sge_context *); static int load_fw(struct adapter *, struct t4_data *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); static int set_sched_class(struct adapter *, struct t4_sched_params *); static int set_sched_queue(struct adapter *, struct t4_sched_queue *); #ifdef TCP_OFFLOAD static int toe_capability(struct port_info *, int); #endif static int mod_event(module_t, int, void *); struct { uint16_t device; char *desc; } t4_pciids[] = { {0xa000, "Chelsio Terminator 4 FPGA"}, {0x4400, "Chelsio T440-dbg"}, {0x4401, "Chelsio T420-CR"}, {0x4402, "Chelsio T422-CR"}, {0x4403, "Chelsio T440-CR"}, {0x4404, "Chelsio T420-BCH"}, {0x4405, "Chelsio T440-BCH"}, {0x4406, "Chelsio T440-CH"}, {0x4407, "Chelsio T420-SO"}, {0x4408, "Chelsio T420-CX"}, {0x4409, "Chelsio T420-BT"}, {0x440a, "Chelsio T404-BT"}, {0x440e, "Chelsio T440-LP-CR"}, }, t5_pciids[] = { {0xb000, "Chelsio Terminator 5 FPGA"}, {0x5400, "Chelsio T580-dbg"}, {0x5401, "Chelsio T520-CR"}, /* 2 x 10G */ {0x5402, "Chelsio T522-CR"}, /* 2 x 10G, 2 X 1G */ {0x5403, "Chelsio T540-CR"}, /* 4 x 10G */ {0x5407, "Chelsio T520-SO"}, /* 2 x 10G, nomem */ {0x5409, "Chelsio T520-BT"}, /* 2 x 10GBaseT */ {0x540a, "Chelsio T504-BT"}, /* 4 x 1G */ {0x540d, "Chelsio T580-CR"}, /* 2 x 40G */ {0x540e, "Chelsio T540-LP-CR"}, /* 4 x 10G */ {0x5410, "Chelsio T580-LP-CR"}, /* 2 x 40G */ {0x5411, "Chelsio T520-LL-CR"}, /* 2 x 10G */ {0x5412, "Chelsio T560-CR"}, /* 1 x 40G, 2 x 10G */ {0x5414, "Chelsio T580-LP-SO-CR"}, /* 2 x 40G, nomem */ {0x5415, "Chelsio T502-BT"}, /* 2 x 1G */ #ifdef notyet {0x5404, "Chelsio T520-BCH"}, {0x5405, "Chelsio T540-BCH"}, {0x5406, "Chelsio T540-CH"}, {0x5408, "Chelsio T520-CX"}, {0x540b, "Chelsio B520-SR"}, {0x540c, "Chelsio B504-BT"}, {0x540f, "Chelsio Amsterdam"}, {0x5413, "Chelsio T580-CHR"}, #endif }; #ifdef TCP_OFFLOAD /* * service_iq() has an iq and needs the fl. Offset of fl from the iq should be * exactly the same for both rxq and ofld_rxq. */ CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif /* No easy way to include t4_msg.h before adapter.h so we check this way */ CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS); CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES); CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); static int t4_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xa000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t4_pciids); i++) { if (d == t4_pciids[i].device) { device_set_desc(dev, t4_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t5_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xb000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t5_pciids); i++) { if (d == t5_pciids[i].device) { device_set_desc(dev, t5_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t4_attach(device_t dev) { struct adapter *sc; int rc = 0, i, n10g, n1g, rqidx, tqidx; struct intrs_and_queues iaq; struct sge *s; #ifdef TCP_OFFLOAD int ofld_rqidx, ofld_tqidx; #endif #ifdef DEV_NETMAP int nm_rqidx, nm_tqidx; #endif const char *pcie_ts; sc = device_get_softc(dev); sc->dev = dev; pci_enable_busmaster(dev); if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) { uint32_t v; pci_set_max_read_req(dev, 4096); v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2); v |= PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5); } sc->traceq = -1; mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF); snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer", device_get_nameunit(dev)); snprintf(sc->lockname, sizeof(sc->lockname), "%s", device_get_nameunit(dev)); mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF); sx_xlock(&t4_list_lock); SLIST_INSERT_HEAD(&t4_list, sc, link); sx_xunlock(&t4_list_lock); mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF); TAILQ_INIT(&sc->sfl); callout_init(&sc->sfl_callout, CALLOUT_MPSAFE); mtx_init(&sc->regwin_lock, "register and memory window", 0, MTX_DEF); rc = map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ /* * This is the real PF# to which we're attaching. Works from within PCI * passthrough environments too, where pci_get_function() could return a * different PF# depending on the passthrough configuration. We need to * use the real PF# in all our communication with the firmware. */ sc->pf = G_SOURCEPF(t4_read_reg(sc, A_PL_WHOAMI)); sc->mbox = sc->pf; memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); sc->an_handler = an_not_handled; for (i = 0; i < nitems(sc->cpl_handler); i++) sc->cpl_handler[i] = cpl_not_handled; for (i = 0; i < nitems(sc->fw_msg_handler); i++) sc->fw_msg_handler[i] = fw_msg_not_handled; t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, t4_filter_rpl); t4_register_cpl_handler(sc, CPL_TRACE_PKT, t4_trace_pkt); t4_register_cpl_handler(sc, CPL_TRACE_PKT_T5, t5_trace_pkt); t4_init_sge_cpl_handlers(sc); /* Prepare the adapter for operation */ rc = -t4_prep_adapter(sc); if (rc != 0) { device_printf(dev, "failed to prepare adapter: %d.\n", rc); goto done; } /* * Do this really early, with the memory windows set up even before the * character device. The userland tool's register i/o and mem read * will work even in "recovery mode". */ setup_memwin(sc); sc->cdev = make_dev(is_t4(sc) ? &t4_cdevsw : &t5_cdevsw, device_get_unit(dev), UID_ROOT, GID_WHEEL, 0600, "%s", device_get_nameunit(dev)); if (sc->cdev == NULL) device_printf(dev, "failed to create nexus char device.\n"); else sc->cdev->si_drv1 = sc; /* Go no further if recovery mode has been requested. */ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { device_printf(dev, "recovery mode.\n"); goto done; } /* Prepare the firmware for operation */ rc = prep_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = map_bar_2(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_create_dma_tag(sc); if (rc != 0) goto done; /* error message displayed already */ /* * First pass over all the ports - allocate VIs and initialize some * basic parameters like mac address, port type, etc. We also figure * out whether a port is 10G or 1G and use that information when * calculating how many interrupts to attempt to allocate. */ n10g = n1g = 0; for_each_port(sc, i) { struct port_info *pi; pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK); sc->port[i] = pi; /* These must be set before t4_port_init */ pi->adapter = sc; pi->port_id = i; /* Allocate the vi and initialize parameters like mac addr */ rc = -t4_port_init(pi, sc->mbox, sc->pf, 0); if (rc != 0) { device_printf(dev, "unable to initialize port %d: %d\n", i, rc); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } pi->link_cfg.requested_fc &= ~(PAUSE_TX | PAUSE_RX); pi->link_cfg.requested_fc |= t4_pause_settings; pi->link_cfg.fc &= ~(PAUSE_TX | PAUSE_RX); pi->link_cfg.fc |= t4_pause_settings; rc = -t4_link_start(sc, sc->mbox, pi->tx_chan, &pi->link_cfg); if (rc != 0) { device_printf(dev, "port %d l1cfg failed: %d\n", i, rc); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d", device_get_nameunit(dev), i); mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF); sc->chan_map[pi->tx_chan] = i; if (is_10G_port(pi) || is_40G_port(pi)) { n10g++; pi->tmr_idx = t4_tmr_idx_10g; pi->pktc_idx = t4_pktc_idx_10g; } else { n1g++; pi->tmr_idx = t4_tmr_idx_1g; pi->pktc_idx = t4_pktc_idx_1g; } pi->xact_addr_filt = -1; pi->linkdnrc = -1; pi->qsize_rxq = t4_qsize_rxq; pi->qsize_txq = t4_qsize_txq; pi->dev = device_add_child(dev, is_t4(sc) ? "cxgbe" : "cxl", -1); if (pi->dev == NULL) { device_printf(dev, "failed to add device for port %d.\n", i); rc = ENXIO; goto done; } device_set_softc(pi->dev, pi); } /* * Interrupt type, # of interrupts, # of rx/tx queues, etc. */ rc = cfg_itype_and_nqueues(sc, n10g, n1g, &iaq); if (rc != 0) goto done; /* error message displayed already */ sc->intr_type = iaq.intr_type; sc->intr_count = iaq.nirq; s = &sc->sge; s->nrxq = n10g * iaq.nrxq10g + n1g * iaq.nrxq1g; s->ntxq = n10g * iaq.ntxq10g + n1g * iaq.ntxq1g; s->neq = s->ntxq + s->nrxq; /* the free list in an rxq is an eq */ s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ #ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g; s->nofldtxq = n10g * iaq.nofldtxq10g + n1g * iaq.nofldtxq1g; s->neq += s->nofldtxq + s->nofldrxq; s->niq += s->nofldrxq; s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef DEV_NETMAP s->nnmrxq = n10g * iaq.nnmrxq10g + n1g * iaq.nnmrxq1g; s->nnmtxq = n10g * iaq.nnmtxq10g + n1g * iaq.nnmtxq1g; s->neq += s->nnmtxq + s->nnmrxq; s->niq += s->nnmrxq; s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq), M_CXGBE, M_ZERO | M_WAITOK); #endif s->ctrlq = malloc(sc->params.nports * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE, M_ZERO | M_WAITOK); s->iqmap = malloc(s->niq * sizeof(struct sge_iq *), M_CXGBE, M_ZERO | M_WAITOK); s->eqmap = malloc(s->neq * sizeof(struct sge_eq *), M_CXGBE, M_ZERO | M_WAITOK); sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE, M_ZERO | M_WAITOK); t4_init_l2t(sc, M_WAITOK); /* * Second pass over the ports. This time we know the number of rx and * tx queues that each port should get. */ rqidx = tqidx = 0; #ifdef TCP_OFFLOAD ofld_rqidx = ofld_tqidx = 0; #endif #ifdef DEV_NETMAP nm_rqidx = nm_tqidx = 0; #endif for_each_port(sc, i) { struct port_info *pi = sc->port[i]; if (pi == NULL) continue; pi->first_rxq = rqidx; pi->first_txq = tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { pi->flags |= iaq.intr_flags_10g; pi->nrxq = iaq.nrxq10g; pi->ntxq = iaq.ntxq10g; } else { pi->flags |= iaq.intr_flags_1g; pi->nrxq = iaq.nrxq1g; pi->ntxq = iaq.ntxq1g; } if (pi->ntxq > 1) pi->rsrv_noflowq = iaq.rsrv_noflowq ? 1 : 0; else pi->rsrv_noflowq = 0; rqidx += pi->nrxq; tqidx += pi->ntxq; #ifdef TCP_OFFLOAD if (is_offload(sc)) { pi->first_ofld_rxq = ofld_rqidx; pi->first_ofld_txq = ofld_tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { pi->nofldrxq = iaq.nofldrxq10g; pi->nofldtxq = iaq.nofldtxq10g; } else { pi->nofldrxq = iaq.nofldrxq1g; pi->nofldtxq = iaq.nofldtxq1g; } ofld_rqidx += pi->nofldrxq; ofld_tqidx += pi->nofldtxq; } #endif #ifdef DEV_NETMAP pi->first_nm_rxq = nm_rqidx; pi->first_nm_txq = nm_tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { pi->nnmrxq = iaq.nnmrxq10g; pi->nnmtxq = iaq.nnmtxq10g; } else { pi->nnmrxq = iaq.nnmrxq1g; pi->nnmtxq = iaq.nnmtxq1g; } nm_rqidx += pi->nnmrxq; nm_tqidx += pi->nnmtxq; #endif } rc = setup_intr_handlers(sc); if (rc != 0) { device_printf(dev, "failed to setup interrupt handlers: %d\n", rc); goto done; } rc = bus_generic_attach(dev); if (rc != 0) { device_printf(dev, "failed to attach all child ports: %d\n", rc); goto done; } switch (sc->params.pci.speed) { case 0x1: pcie_ts = "2.5"; break; case 0x2: pcie_ts = "5.0"; break; case 0x3: pcie_ts = "8.0"; break; default: pcie_ts = "??"; break; } device_printf(dev, "PCIe x%d (%s GTS/s) (%d), %d ports, %d %s interrupt%s, %d eq, %d iq\n", sc->params.pci.width, pcie_ts, sc->params.pci.speed, sc->params.nports, sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" : (sc->intr_type == INTR_MSI ? "MSI" : "INTx"), sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq); t4_set_desc(sc); done: if (rc != 0 && sc->cdev) { /* cdev was created and so cxgbetool works; recover that way. */ device_printf(dev, "error during attach, adapter is now in recovery mode.\n"); rc = 0; } if (rc != 0) t4_detach(dev); else t4_sysctls(sc); return (rc); } /* * Idempotent */ static int t4_detach(device_t dev) { struct adapter *sc; struct port_info *pi; int i, rc; sc = device_get_softc(dev); if (sc->flags & FULL_INIT_DONE) t4_intr_disable(sc); if (sc->cdev) { destroy_dev(sc->cdev); sc->cdev = NULL; } rc = bus_generic_detach(dev); if (rc) { device_printf(dev, "failed to detach child devices: %d\n", rc); return (rc); } for (i = 0; i < sc->intr_count; i++) t4_free_irq(sc, &sc->irq[i]); for (i = 0; i < MAX_NPORTS; i++) { pi = sc->port[i]; if (pi) { t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->viid); if (pi->dev) device_delete_child(dev, pi->dev); mtx_destroy(&pi->pi_lock); free(pi, M_CXGBE); } } if (sc->flags & FULL_INIT_DONE) adapter_full_uninit(sc); if (sc->flags & FW_OK) t4_fw_bye(sc, sc->mbox); if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX) pci_release_msi(dev); if (sc->regs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid, sc->regs_res); if (sc->udbs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); if (sc->msix_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid, sc->msix_res); if (sc->l2t) t4_free_l2t(sc->l2t); #ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); free(sc->sge.ofld_txq, M_CXGBE); #endif #ifdef DEV_NETMAP free(sc->sge.nm_rxq, M_CXGBE); free(sc->sge.nm_txq, M_CXGBE); #endif free(sc->irq, M_CXGBE); free(sc->sge.rxq, M_CXGBE); free(sc->sge.txq, M_CXGBE); free(sc->sge.ctrlq, M_CXGBE); free(sc->sge.iqmap, M_CXGBE); free(sc->sge.eqmap, M_CXGBE); free(sc->tids.ftid_tab, M_CXGBE); t4_destroy_dma_tag(sc); if (mtx_initialized(&sc->sc_lock)) { sx_xlock(&t4_list_lock); SLIST_REMOVE(&t4_list, sc, adapter, link); sx_xunlock(&t4_list_lock); mtx_destroy(&sc->sc_lock); } if (mtx_initialized(&sc->tids.ftid_lock)) mtx_destroy(&sc->tids.ftid_lock); if (mtx_initialized(&sc->sfl_lock)) mtx_destroy(&sc->sfl_lock); if (mtx_initialized(&sc->ifp_lock)) mtx_destroy(&sc->ifp_lock); if (mtx_initialized(&sc->regwin_lock)) mtx_destroy(&sc->regwin_lock); bzero(sc, sizeof(*sc)); return (0); } static int cxgbe_probe(device_t dev) { char buf[128]; struct port_info *pi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d", pi->port_id); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS) #define T4_CAP_ENABLE (T4_CAP) static int cxgbe_attach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct ifnet *ifp; char *s; int n, o; /* Allocate an ifnet and set it up */ ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); } pi->ifp = ifp; ifp->if_softc = pi; callout_init(&pi->tick, CALLOUT_MPSAFE); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = cxgbe_init; ifp->if_ioctl = cxgbe_ioctl; ifp->if_transmit = cxgbe_transmit; ifp->if_qflush = cxgbe_qflush; ifp->if_get_counter = cxgbe_get_counter; ifp->if_capabilities = T4_CAP; #ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) ifp->if_capabilities |= IFCAP_TOE; #endif ifp->if_capenable = T4_CAP_ENABLE; ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6; ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS; ifp->if_hw_tsomaxsegsize = 65536; /* Initialize ifmedia for this port */ ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); build_medialist(pi, &pi->media); pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp, EVENTHANDLER_PRI_ANY); ether_ifattach(ifp, pi->hw_addr); n = 128; s = malloc(n, M_CXGBE, M_WAITOK); o = snprintf(s, n, "%d txq, %d rxq (NIC)", pi->ntxq, pi->nrxq); MPASS(n > o); #ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { o += snprintf(s + o, n - o, "; %d txq, %d rxq (TOE)", pi->nofldtxq, pi->nofldrxq); MPASS(n > o); } #endif #ifdef DEV_NETMAP o += snprintf(s + o, n - o, "; %d txq, %d rxq (netmap)", pi->nnmtxq, pi->nnmrxq); MPASS(n > o); #endif device_printf(dev, "%s\n", s); free(s, M_CXGBE); #ifdef DEV_NETMAP /* nm_media handled here to keep implementation private to this file */ ifmedia_init(&pi->nm_media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); build_medialist(pi, &pi->nm_media); create_netmap_ifnet(pi); /* logs errors it something fails */ #endif cxgbe_sysctls(pi); return (0); } static int cxgbe_detach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; /* Tell if_ioctl and if_init that the port is going away */ ADAPTER_LOCK(sc); SET_DOOMED(pi); wakeup(&sc->flags); while (IS_BUSY(sc)) mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = "t4detach"; sc->last_op_thr = curthread; #endif ADAPTER_UNLOCK(sc); if (pi->flags & HAS_TRACEQ) { sc->traceq = -1; /* cloner should not create ifnet */ t4_tracer_port_detach(sc); } if (pi->vlan_c) EVENTHANDLER_DEREGISTER(vlan_config, pi->vlan_c); PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&pi->tick); PORT_UNLOCK(pi); callout_drain(&pi->tick); /* Let detach proceed even if these fail. */ cxgbe_uninit_synchronized(pi); port_full_uninit(pi); ifmedia_removeall(&pi->media); ether_ifdetach(pi->ifp); if_free(pi->ifp); #ifdef DEV_NETMAP /* XXXNM: equivalent of cxgbe_uninit_synchronized to ifdown nm_ifp */ destroy_netmap_ifnet(pi); #endif ADAPTER_LOCK(sc); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); return (0); } static void cxgbe_init(void *arg) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; if (begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4init") != 0) return; cxgbe_init_synchronized(pi); end_synchronized_op(sc, 0); } static int cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) { int rc = 0, mtu, flags, can_sleep; struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; struct ifreq *ifr = (struct ifreq *)data; uint32_t mask; switch (cmd) { case SIOCSIFMTU: mtu = ifr->ifr_mtu; if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO)) return (EINVAL); rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4mtu"); if (rc) return (rc); ifp->if_mtu = mtu; if (pi->flags & PORT_INIT_DONE) { t4_update_fl_bufsize(ifp); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MTU); } end_synchronized_op(sc, 0); break; case SIOCSIFFLAGS: can_sleep = 0; redo_sifflags: rc = begin_synchronized_op(sc, pi, can_sleep ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4flg"); if (rc) return (rc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { flags = pi->if_flags; if ((ifp->if_flags ^ flags) & (IFF_PROMISC | IFF_ALLMULTI)) { if (can_sleep == 1) { end_synchronized_op(sc, 0); can_sleep = 0; goto redo_sifflags; } rc = update_mac_settings(ifp, XGMAC_PROMISC | XGMAC_ALLMULTI); } } else { if (can_sleep == 0) { end_synchronized_op(sc, LOCK_HELD); can_sleep = 1; goto redo_sifflags; } rc = cxgbe_init_synchronized(pi); } pi->if_flags = ifp->if_flags; } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (can_sleep == 0) { end_synchronized_op(sc, LOCK_HELD); can_sleep = 1; goto redo_sifflags; } rc = cxgbe_uninit_synchronized(pi); } end_synchronized_op(sc, can_sleep ? 0 : LOCK_HELD); break; case SIOCADDMULTI: case SIOCDELMULTI: /* these two are called with a mutex held :-( */ rc = begin_synchronized_op(sc, pi, HOLD_LOCK, "t4multi"); if (rc) return (rc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MCADDRS); end_synchronized_op(sc, LOCK_HELD); break; case SIOCSIFCAP: rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4cap"); if (rc) return (rc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO4; if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & ifp->if_capenable && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO6; if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; /* * Note that we leave CSUM_TSO alone (it is always set). The * kernel takes both IFCAP_TSOx and CSUM_TSO into account before * sending a TSO request our way, so it's sufficient to toggle * IFCAP_TSOx only. */ if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & ifp->if_capenable) && !(IFCAP_TXCSUM & ifp->if_capenable)) { if_printf(ifp, "enable txcsum first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO4; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & ifp->if_capenable) && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { if_printf(ifp, "enable txcsum6 first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO6; } if (mask & IFCAP_LRO) { #if defined(INET) || defined(INET6) int i; struct sge_rxq *rxq; ifp->if_capenable ^= IFCAP_LRO; for_each_rxq(pi, i, rxq) { if (ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; else rxq->iq.flags &= ~IQ_LRO_ENABLED; } #endif } #ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; rc = toe_capability(pi, enable); if (rc != 0) goto fail; ifp->if_capenable ^= mask; } #endif if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_VLANEX); } if (mask & IFCAP_VLAN_MTU) { ifp->if_capenable ^= IFCAP_VLAN_MTU; /* Need to find out how to disable auto-mtu-inflation */ } if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_VLAN_HWCSUM) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); #endif fail: end_synchronized_op(sc, 0); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: ifmedia_ioctl(ifp, ifr, &pi->media, cmd); break; case SIOCGI2C: { struct ifi2creq i2c; rc = copyin(ifr->ifr_data, &i2c, sizeof(i2c)); if (rc != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { rc = EPERM; break; } if (i2c.len > sizeof(i2c.data)) { rc = EINVAL; break; } rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4i2c"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, pi->port_id, i2c.dev_addr, i2c.offset, i2c.len, &i2c.data[0]); end_synchronized_op(sc, 0); if (rc == 0) rc = copyout(&i2c, ifr->ifr_data, sizeof(i2c)); break; } default: rc = ether_ioctl(ifp, cmd, data); } return (rc); } static int cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; struct sge_txq *txq = &sc->sge.txq[pi->first_txq]; struct buf_ring *br; int rc; M_ASSERTPKTHDR(m); if (__predict_false(pi->link_cfg.link_ok == 0)) { m_freem(m); return (ENETDOWN); } - if (m->m_flags & M_FLOWID) + /* check if flowid is set */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) + pi->rsrv_noflowq); br = txq->br; if (TXQ_TRYLOCK(txq) == 0) { struct sge_eq *eq = &txq->eq; /* * It is possible that t4_eth_tx finishes up and releases the * lock between the TRYLOCK above and the drbr_enqueue here. We * need to make sure that this mbuf doesn't just sit there in * the drbr. */ rc = drbr_enqueue(ifp, br, m); if (rc == 0 && callout_pending(&eq->tx_callout) == 0 && !(eq->flags & EQ_DOOMED)) callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); return (rc); } /* * txq->m is the mbuf that is held up due to a temporary shortage of * resources and it should be put on the wire first. Then what's in * drbr and finally the mbuf that was just passed in to us. * * Return code should indicate the fate of the mbuf that was passed in * this time. */ TXQ_LOCK_ASSERT_OWNED(txq); if (drbr_needs_enqueue(ifp, br) || txq->m) { /* Queued for transmission. */ rc = drbr_enqueue(ifp, br, m); m = txq->m ? txq->m : drbr_dequeue(ifp, br); (void) t4_eth_tx(ifp, txq, m); TXQ_UNLOCK(txq); return (rc); } /* Direct transmission. */ rc = t4_eth_tx(ifp, txq, m); if (rc != 0 && txq->m) rc = 0; /* held, will be transmitted soon (hopefully) */ TXQ_UNLOCK(txq); return (rc); } static void cxgbe_qflush(struct ifnet *ifp) { struct port_info *pi = ifp->if_softc; struct sge_txq *txq; int i; struct mbuf *m; /* queues do not exist if !PORT_INIT_DONE. */ if (pi->flags & PORT_INIT_DONE) { for_each_txq(pi, i, txq) { TXQ_LOCK(txq); m_freem(txq->m); txq->m = NULL; while ((m = buf_ring_dequeue_sc(txq->br)) != NULL) m_freem(m); TXQ_UNLOCK(txq); } } if_qflush(ifp); } static uint64_t cxgbe_get_counter(struct ifnet *ifp, ift_counter c) { struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; struct port_stats *s = &pi->stats; cxgbe_refresh_stats(sc, pi); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_frames - s->rx_pause); case IFCOUNTER_IERRORS: return (s->rx_jabber + s->rx_runt + s->rx_too_long + s->rx_fcs_err + s->rx_len_err); case IFCOUNTER_OPACKETS: return (s->tx_frames - s->tx_pause); case IFCOUNTER_OERRORS: return (s->tx_error_frames); case IFCOUNTER_IBYTES: return (s->rx_octets - s->rx_pause * 64); case IFCOUNTER_OBYTES: return (s->tx_octets - s->tx_pause * 64); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames - s->rx_pause); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames - s->tx_pause); case IFCOUNTER_IQDROPS: return (s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 + s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 + s->rx_trunc3 + pi->tnl_cong_drops); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = s->tx_drop; if (pi->flags & PORT_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(pi, i, txq) drops += txq->br->br_drops; } return (drops); } default: return (if_get_counter_default(ifp, c)); } } static int cxgbe_media_change(struct ifnet *ifp) { struct port_info *pi = ifp->if_softc; device_printf(pi->dev, "%s unimplemented.\n", __func__); return (EOPNOTSUPP); } static void cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct port_info *pi = ifp->if_softc; struct ifmedia *media = NULL; struct ifmedia_entry *cur; int speed = pi->link_cfg.speed; #ifdef INVARIANTS int data = (pi->port_type << 8) | pi->mod_type; #endif if (ifp == pi->ifp) media = &pi->media; #ifdef DEV_NETMAP else if (ifp == pi->nm_ifp) media = &pi->nm_media; #endif MPASS(media != NULL); cur = media->ifm_cur; MPASS(cur->ifm_data == data); ifmr->ifm_status = IFM_AVALID; if (!pi->link_cfg.link_ok) return; ifmr->ifm_status |= IFM_ACTIVE; /* active and current will differ iff current media is autoselect. */ if (IFM_SUBTYPE(cur->ifm_media) != IFM_AUTO) return; ifmr->ifm_active = IFM_ETHER | IFM_FDX; if (speed == SPEED_10000) ifmr->ifm_active |= IFM_10G_T; else if (speed == SPEED_1000) ifmr->ifm_active |= IFM_1000_T; else if (speed == SPEED_100) ifmr->ifm_active |= IFM_100_TX; else if (speed == SPEED_10) ifmr->ifm_active |= IFM_10_T; else KASSERT(0, ("%s: link up but speed unknown (%u)", __func__, speed)); } void t4_fatal_err(struct adapter *sc) { t4_set_reg_field(sc, A_SGE_CONTROL, F_GLOBALENABLE, 0); t4_intr_disable(sc); log(LOG_EMERG, "%s: encountered fatal error, adapter stopped.\n", device_get_nameunit(sc->dev)); } static int map_bars_0_and_4(struct adapter *sc) { sc->regs_rid = PCIR_BAR(0); sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE); if (sc->regs_res == NULL) { device_printf(sc->dev, "cannot map registers.\n"); return (ENXIO); } sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); setbit(&sc->doorbells, DOORBELL_KDB); sc->msix_rid = PCIR_BAR(4); sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->msix_rid, RF_ACTIVE); if (sc->msix_res == NULL) { device_printf(sc->dev, "cannot map MSI-X BAR.\n"); return (ENXIO); } return (0); } static int map_bar_2(struct adapter *sc) { /* * T4: only iWARP driver uses the userspace doorbells. There is no need * to map it if RDMA is disabled. */ if (is_t4(sc) && sc->rdmacaps == 0) return (0); sc->udbs_rid = PCIR_BAR(2); sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->udbs_rid, RF_ACTIVE); if (sc->udbs_res == NULL) { device_printf(sc->dev, "cannot map doorbell BAR.\n"); return (ENXIO); } sc->udbs_base = rman_get_virtual(sc->udbs_res); if (is_t5(sc)) { setbit(&sc->doorbells, DOORBELL_UDB); #if defined(__i386__) || defined(__amd64__) if (t5_write_combine) { int rc; /* * Enable write combining on BAR2. This is the * userspace doorbell BAR and is split into 128B * (UDBS_SEG_SIZE) doorbell regions, each associated * with an egress queue. The first 64B has the doorbell * and the second 64B can be used to submit a tx work * request with an implicit doorbell. */ rc = pmap_change_attr((vm_offset_t)sc->udbs_base, rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING); if (rc == 0) { clrbit(&sc->doorbells, DOORBELL_UDB); setbit(&sc->doorbells, DOORBELL_WCWR); setbit(&sc->doorbells, DOORBELL_UDBWC); } else { device_printf(sc->dev, "couldn't enable write combining: %d\n", rc); } t4_write_reg(sc, A_SGE_STAT_CFG, V_STATSOURCE_T5(7) | V_STATMODE(0)); } #endif } return (0); } static const struct memwin t4_memwin[] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 } }; static const struct memwin t5_memwin[] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 }, }; static void setup_memwin(struct adapter *sc) { const struct memwin *mw; int i, n; uint32_t bar0; if (is_t4(sc)) { /* * Read low 32b of bar0 indirectly via the hardware backdoor * mechanism. Works from within PCI passthrough environments * too, where rman_get_start() can return a different value. We * need to program the T4 memory window decoders with the actual * addresses that will be coming across the PCIe link. */ bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0)); bar0 &= (uint32_t) PCIM_BAR_MEM_BASE; mw = &t4_memwin[0]; n = nitems(t4_memwin); } else { /* T5 uses the relative offset inside the PCIe BAR */ bar0 = 0; mw = &t5_memwin[0]; n = nitems(t5_memwin); } for (i = 0; i < n; i++, mw++) { t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i), (mw->base + bar0) | V_BIR(0) | V_WINDOW(ilog2(mw->aperture) - 10)); } /* flush */ t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2)); } /* * Verify that the memory range specified by the addr/len pair is valid and lies * entirely within a single region (EDCx or MCx). */ static int validate_mem_range(struct adapter *sc, uint32_t addr, int len) { uint32_t em, addr_len, maddr, mlen; /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len == 0) return (EINVAL); /* Enabled memories */ em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (em & F_EDRAM0_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; mlen = G_EDRAM0_SIZE(addr_len) << 20; if (mlen > 0 && addr >= maddr && addr < maddr + mlen && addr + len <= maddr + mlen) return (0); } if (em & F_EDRAM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; mlen = G_EDRAM1_SIZE(addr_len) << 20; if (mlen > 0 && addr >= maddr && addr < maddr + mlen && addr + len <= maddr + mlen) return (0); } if (em & F_EXT_MEM_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; mlen = G_EXT_MEM_SIZE(addr_len) << 20; if (mlen > 0 && addr >= maddr && addr < maddr + mlen && addr + len <= maddr + mlen) return (0); } if (!is_t4(sc) && em & F_EXT_MEM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; mlen = G_EXT_MEM1_SIZE(addr_len) << 20; if (mlen > 0 && addr >= maddr && addr < maddr + mlen && addr + len <= maddr + mlen) return (0); } return (EFAULT); } static int fwmtype_to_hwmtype(int mtype) { switch (mtype) { case FW_MEMTYPE_EDC0: return (MEM_EDC0); case FW_MEMTYPE_EDC1: return (MEM_EDC1); case FW_MEMTYPE_EXTMEM: return (MEM_MC0); case FW_MEMTYPE_EXTMEM1: return (MEM_MC1); default: panic("%s: cannot translate fw mtype %d.", __func__, mtype); } } /* * Verify that the memory range specified by the memtype/offset/len pair is * valid and lies entirely within the memtype specified. The global address of * the start of the range is returned in addr. */ static int validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len, uint32_t *addr) { uint32_t em, addr_len, maddr, mlen; /* Memory can only be accessed in naturally aligned 4 byte units */ if (off & 3 || len & 3 || len == 0) return (EINVAL); em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); switch (fwmtype_to_hwmtype(mtype)) { case MEM_EDC0: if (!(em & F_EDRAM0_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; mlen = G_EDRAM0_SIZE(addr_len) << 20; break; case MEM_EDC1: if (!(em & F_EDRAM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; mlen = G_EDRAM1_SIZE(addr_len) << 20; break; case MEM_MC: if (!(em & F_EXT_MEM_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; mlen = G_EXT_MEM_SIZE(addr_len) << 20; break; case MEM_MC1: if (is_t4(sc) || !(em & F_EXT_MEM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; mlen = G_EXT_MEM1_SIZE(addr_len) << 20; break; default: return (EINVAL); } if (mlen > 0 && off < mlen && off + len <= mlen) { *addr = maddr + off; /* global address */ return (0); } return (EFAULT); } static void memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture) { const struct memwin *mw; if (is_t4(sc)) { KASSERT(win >= 0 && win < nitems(t4_memwin), ("%s: incorrect memwin# (%d)", __func__, win)); mw = &t4_memwin[win]; } else { KASSERT(win >= 0 && win < nitems(t5_memwin), ("%s: incorrect memwin# (%d)", __func__, win)); mw = &t5_memwin[win]; } if (base != NULL) *base = mw->base; if (aperture != NULL) *aperture = mw->aperture; } /* * Positions the memory window such that it can be used to access the specified * address in the chip's address space. The return value is the offset of addr * from the start of the window. */ static uint32_t position_memwin(struct adapter *sc, int n, uint32_t addr) { uint32_t start, pf; uint32_t reg; KASSERT(n >= 0 && n <= 3, ("%s: invalid window %d.", __func__, n)); KASSERT((addr & 3) == 0, ("%s: addr (0x%x) is not at a 4B boundary.", __func__, addr)); if (is_t4(sc)) { pf = 0; start = addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); start = addr & ~0x7f; /* start must be 128B aligned */ } reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, n); t4_write_reg(sc, reg, start | pf); t4_read_reg(sc, reg); return (addr - start); } static int cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, struct intrs_and_queues *iaq) { int rc, itype, navail, nrxq10g, nrxq1g, n; int nofldrxq10g = 0, nofldrxq1g = 0; int nnmrxq10g = 0, nnmrxq1g = 0; bzero(iaq, sizeof(*iaq)); iaq->ntxq10g = t4_ntxq10g; iaq->ntxq1g = t4_ntxq1g; iaq->nrxq10g = nrxq10g = t4_nrxq10g; iaq->nrxq1g = nrxq1g = t4_nrxq1g; iaq->rsrv_noflowq = t4_rsrv_noflowq; #ifdef TCP_OFFLOAD if (is_offload(sc)) { iaq->nofldtxq10g = t4_nofldtxq10g; iaq->nofldtxq1g = t4_nofldtxq1g; iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g; iaq->nofldrxq1g = nofldrxq1g = t4_nofldrxq1g; } #endif #ifdef DEV_NETMAP iaq->nnmtxq10g = t4_nnmtxq10g; iaq->nnmtxq1g = t4_nnmtxq1g; iaq->nnmrxq10g = nnmrxq10g = t4_nnmrxq10g; iaq->nnmrxq1g = nnmrxq1g = t4_nnmrxq1g; #endif for (itype = INTR_MSIX; itype; itype >>= 1) { if ((itype & t4_intr_types) == 0) continue; /* not allowed */ if (itype == INTR_MSIX) navail = pci_msix_count(sc->dev); else if (itype == INTR_MSI) navail = pci_msi_count(sc->dev); else navail = 1; restart: if (navail == 0) continue; iaq->intr_type = itype; iaq->intr_flags_10g = 0; iaq->intr_flags_1g = 0; /* * Best option: an interrupt vector for errors, one for the * firmware event queue, and one for every rxq (NIC, TOE, and * netmap). */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g * (nrxq10g + nofldrxq10g + nnmrxq10g); iaq->nirq += n1g * (nrxq1g + nofldrxq1g + nnmrxq1g); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { iaq->intr_flags_10g = INTR_ALL; iaq->intr_flags_1g = INTR_ALL; goto allocate; } /* * Second best option: a vector for errors, one for the firmware * event queue, and vectors for either all the NIC rx queues or * all the TOE rx queues. The queues that don't get vectors * will forward their interrupts to those that do. * * Note: netmap rx queues cannot be created early and so they * can't be setup to receive forwarded interrupts for others. */ iaq->nirq = T4_EXTRA_INTR; if (nrxq10g >= nofldrxq10g) { iaq->intr_flags_10g = INTR_RXQ; iaq->nirq += n10g * nrxq10g; #ifdef DEV_NETMAP iaq->nnmrxq10g = min(nnmrxq10g, nrxq10g); #endif } else { iaq->intr_flags_10g = INTR_OFLD_RXQ; iaq->nirq += n10g * nofldrxq10g; #ifdef DEV_NETMAP iaq->nnmrxq10g = min(nnmrxq10g, nofldrxq10g); #endif } if (nrxq1g >= nofldrxq1g) { iaq->intr_flags_1g = INTR_RXQ; iaq->nirq += n1g * nrxq1g; #ifdef DEV_NETMAP iaq->nnmrxq1g = min(nnmrxq1g, nrxq1g); #endif } else { iaq->intr_flags_1g = INTR_OFLD_RXQ; iaq->nirq += n1g * nofldrxq1g; #ifdef DEV_NETMAP iaq->nnmrxq1g = min(nnmrxq1g, nofldrxq1g); #endif } if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) goto allocate; /* * Next best option: an interrupt vector for errors, one for the * firmware event queue, and at least one per port. At this * point we know we'll have to downsize nrxq and/or nofldrxq * and/or nnmrxq to fit what's available to us. */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g + n1g; if (iaq->nirq <= navail) { int leftover = navail - iaq->nirq; if (n10g > 0) { int target = max(nrxq10g, nofldrxq10g); iaq->intr_flags_10g = nrxq10g >= nofldrxq10g ? INTR_RXQ : INTR_OFLD_RXQ; n = 1; while (n < target && leftover >= n10g) { leftover -= n10g; iaq->nirq += n10g; n++; } iaq->nrxq10g = min(n, nrxq10g); #ifdef TCP_OFFLOAD iaq->nofldrxq10g = min(n, nofldrxq10g); #endif #ifdef DEV_NETMAP iaq->nnmrxq10g = min(n, nnmrxq10g); #endif } if (n1g > 0) { int target = max(nrxq1g, nofldrxq1g); iaq->intr_flags_1g = nrxq1g >= nofldrxq1g ? INTR_RXQ : INTR_OFLD_RXQ; n = 1; while (n < target && leftover >= n1g) { leftover -= n1g; iaq->nirq += n1g; n++; } iaq->nrxq1g = min(n, nrxq1g); #ifdef TCP_OFFLOAD iaq->nofldrxq1g = min(n, nofldrxq1g); #endif #ifdef DEV_NETMAP iaq->nnmrxq1g = min(n, nnmrxq1g); #endif } if (itype != INTR_MSI || powerof2(iaq->nirq)) goto allocate; } /* * Least desirable option: one interrupt vector for everything. */ iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; iaq->intr_flags_10g = iaq->intr_flags_1g = 0; #ifdef TCP_OFFLOAD if (is_offload(sc)) iaq->nofldrxq10g = iaq->nofldrxq1g = 1; #endif #ifdef DEV_NETMAP iaq->nnmrxq10g = iaq->nnmrxq1g = 1; #endif allocate: navail = iaq->nirq; rc = 0; if (itype == INTR_MSIX) rc = pci_alloc_msix(sc->dev, &navail); else if (itype == INTR_MSI) rc = pci_alloc_msi(sc->dev, &navail); if (rc == 0) { if (navail == iaq->nirq) return (0); /* * Didn't get the number requested. Use whatever number * the kernel is willing to allocate (it's in navail). */ device_printf(sc->dev, "fewer vectors than requested, " "type=%d, req=%d, rcvd=%d; will downshift req.\n", itype, iaq->nirq, navail); pci_release_msi(sc->dev); goto restart; } device_printf(sc->dev, "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n", itype, rc, iaq->nirq, navail); } device_printf(sc->dev, "failed to find a usable interrupt type. " "allowed=%d, msi-x=%d, msi=%d, intx=1", t4_intr_types, pci_msix_count(sc->dev), pci_msi_count(sc->dev)); return (ENXIO); } #define FW_VERSION(chip) ( \ V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \ V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \ V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \ V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD)) #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf) struct fw_info { uint8_t chip; char *kld_name; char *fw_mod_name; struct fw_hdr fw_hdr; /* XXX: waste of space, need a sparse struct */ } fw_info[] = { { .chip = CHELSIO_T4, .kld_name = "t4fw_cfg", .fw_mod_name = "t4fw", .fw_hdr = { .chip = FW_HDR_CHIP_T4, .fw_ver = htobe32_const(FW_VERSION(T4)), .intfver_nic = FW_INTFVER(T4, NIC), .intfver_vnic = FW_INTFVER(T4, VNIC), .intfver_ofld = FW_INTFVER(T4, OFLD), .intfver_ri = FW_INTFVER(T4, RI), .intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T4, ISCSI), .intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU), .intfver_fcoe = FW_INTFVER(T4, FCOE), }, }, { .chip = CHELSIO_T5, .kld_name = "t5fw_cfg", .fw_mod_name = "t5fw", .fw_hdr = { .chip = FW_HDR_CHIP_T5, .fw_ver = htobe32_const(FW_VERSION(T5)), .intfver_nic = FW_INTFVER(T5, NIC), .intfver_vnic = FW_INTFVER(T5, VNIC), .intfver_ofld = FW_INTFVER(T5, OFLD), .intfver_ri = FW_INTFVER(T5, RI), .intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T5, ISCSI), .intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU), .intfver_fcoe = FW_INTFVER(T5, FCOE), }, } }; static struct fw_info * find_fw_info(int chip) { int i; for (i = 0; i < nitems(fw_info); i++) { if (fw_info[i].chip == chip) return (&fw_info[i]); } return (NULL); } /* * Is the given firmware API compatible with the one the driver was compiled * with? */ static int fw_compatible(const struct fw_hdr *hdr1, const struct fw_hdr *hdr2) { /* short circuit if it's the exact same firmware version */ if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) return (1); /* * XXX: Is this too conservative? Perhaps I should limit this to the * features that are supported in the driver. */ #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) && SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe)) return (1); #undef SAME_INTF return (0); } /* * The firmware in the KLD is usable, but should it be installed? This routine * explains itself in detail if it indicates the KLD firmware should be * installed. */ static int should_install_kld_fw(struct adapter *sc, int card_fw_usable, int k, int c) { const char *reason; if (!card_fw_usable) { reason = "incompatible or unusable"; goto install; } if (k > c) { reason = "older than the version bundled with this driver"; goto install; } if (t4_fw_install == 2 && k != c) { reason = "different than the version bundled with this driver"; goto install; } return (0); install: if (t4_fw_install == 0) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver is prohibited from installing a different " "firmware on the card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); return (0); } device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "installing firmware %u.%u.%u.%u on card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason, G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k)); return (1); } /* * Establish contact with the firmware and determine if we are the master driver * or not, and whether we are responsible for chip initialization. */ static int prep_firmware(struct adapter *sc) { const struct firmware *fw = NULL, *default_cfg; int rc, pf, card_fw_usable, kld_fw_usable, need_fw_reset = 1; enum dev_state state; struct fw_info *fw_info; struct fw_hdr *card_fw; /* fw on the card */ const struct fw_hdr *kld_fw; /* fw in the KLD */ const struct fw_hdr *drv_fw; /* fw header the driver was compiled against */ /* Contact firmware. */ rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state); if (rc < 0 || state == DEV_STATE_ERR) { rc = -rc; device_printf(sc->dev, "failed to connect to the firmware: %d, %d.\n", rc, state); return (rc); } pf = rc; if (pf == sc->mbox) sc->flags |= MASTER_PF; else if (state == DEV_STATE_UNINIT) { /* * We didn't get to be the master so we definitely won't be * configuring the chip. It's a bug if someone else hasn't * configured it already. */ device_printf(sc->dev, "couldn't be master(%d), " "device not already initialized either(%d).\n", rc, state); return (EDOOFUS); } /* This is the firmware whose headers the driver was compiled against */ fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } drv_fw = &fw_info->fw_hdr; /* * The firmware KLD contains many modules. The KLD name is also the * name of the module that contains the default config file. */ default_cfg = firmware_get(fw_info->kld_name); /* Read the header of the firmware on the card */ card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_read_flash(sc, FLASH_FW_START, sizeof (*card_fw) / sizeof (uint32_t), (uint32_t *)card_fw, 1); if (rc == 0) card_fw_usable = fw_compatible(drv_fw, (const void*)card_fw); else { device_printf(sc->dev, "Unable to read card's firmware header: %d\n", rc); card_fw_usable = 0; } /* This is the firmware in the KLD */ fw = firmware_get(fw_info->fw_mod_name); if (fw != NULL) { kld_fw = (const void *)fw->data; kld_fw_usable = fw_compatible(drv_fw, kld_fw); } else { kld_fw = NULL; kld_fw_usable = 0; } if (card_fw_usable && card_fw->fw_ver == drv_fw->fw_ver && (!kld_fw_usable || kld_fw->fw_ver == drv_fw->fw_ver)) { /* * Common case: the firmware on the card is an exact match and * the KLD is an exact match too, or the KLD is * absent/incompatible. Note that t4_fw_install = 2 is ignored * here -- use cxgbetool loadfw if you want to reinstall the * same firmware as the one on the card. */ } else if (kld_fw_usable && state == DEV_STATE_UNINIT && should_install_kld_fw(sc, card_fw_usable, be32toh(kld_fw->fw_ver), be32toh(card_fw->fw_ver))) { rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0); if (rc != 0) { device_printf(sc->dev, "failed to install firmware: %d\n", rc); goto done; } /* Installed successfully, update the cached header too. */ memcpy(card_fw, kld_fw, sizeof(*card_fw)); card_fw_usable = 1; need_fw_reset = 0; /* already reset as part of load_fw */ } if (!card_fw_usable) { uint32_t d, c, k; d = ntohl(drv_fw->fw_ver); c = ntohl(card_fw->fw_ver); k = kld_fw ? ntohl(kld_fw->fw_ver) : 0; device_printf(sc->dev, "Cannot find a usable firmware: " "fw_install %d, chip state %d, " "driver compiled with %d.%d.%d.%d, " "card has %d.%d.%d.%d, KLD has %d.%d.%d.%d\n", t4_fw_install, state, G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d), G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k)); rc = EINVAL; goto done; } /* We're using whatever's on the card and it's known to be good. */ sc->params.fw_vers = ntohl(card_fw->fw_ver); snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers), G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers)); t4_get_tp_version(sc, &sc->params.tp_vers); /* Reset device */ if (need_fw_reset && (rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST)) != 0) { device_printf(sc->dev, "firmware reset failed: %d.\n", rc); if (rc != ETIMEDOUT && rc != EIO) t4_fw_bye(sc, sc->mbox); goto done; } sc->flags |= FW_OK; rc = get_params__pre_init(sc); if (rc != 0) goto done; /* error message displayed already */ /* Partition adapter resources as specified in the config file. */ if (state == DEV_STATE_UNINIT) { KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); rc = partition_resources(sc, default_cfg, fw_info->kld_name); if (rc != 0) goto done; /* error message displayed already */ t4_tweak_chip_settings(sc); /* get basic stuff going */ rc = -t4_fw_initialize(sc, sc->mbox); if (rc != 0) { device_printf(sc->dev, "fw init failed: %d.\n", rc); goto done; } } else { snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", pf); sc->cfcsum = 0; } done: free(card_fw, M_CXGBE); if (fw != NULL) firmware_put(fw, FIRMWARE_UNLOAD); if (default_cfg != NULL) firmware_put(default_cfg, FIRMWARE_UNLOAD); return (rc); } #define FW_PARAM_DEV(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param)) #define FW_PARAM_PFVF(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)) /* * Partition chip resources for use between various PFs, VFs, etc. */ static int partition_resources(struct adapter *sc, const struct firmware *default_cfg, const char *name_prefix) { const struct firmware *cfg = NULL; int rc = 0; struct fw_caps_config_cmd caps; uint32_t mtype, moff, finicsum, cfcsum; /* * Figure out what configuration file to use. Pick the default config * file for the card if the user hasn't specified one explicitly. */ snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", t4_cfg_file); if (strncmp(t4_cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { /* Card specific overrides go here. */ if (pci_get_device(sc->dev) == 0x440a) snprintf(sc->cfg_file, sizeof(sc->cfg_file), UWIRE_CF); if (is_fpga(sc)) snprintf(sc->cfg_file, sizeof(sc->cfg_file), FPGA_CF); } /* * We need to load another module if the profile is anything except * "default" or "flash". */ if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) != 0 && strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) { char s[32]; snprintf(s, sizeof(s), "%s_%s", name_prefix, sc->cfg_file); cfg = firmware_get(s); if (cfg == NULL) { if (default_cfg != NULL) { device_printf(sc->dev, "unable to load module \"%s\" for " "configuration profile \"%s\", will use " "the default config file instead.\n", s, sc->cfg_file); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", DEFAULT_CF); } else { device_printf(sc->dev, "unable to load module \"%s\" for " "configuration profile \"%s\", will use " "the config file on the card's flash " "instead.\n", s, sc->cfg_file); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF); } } } if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) == 0 && default_cfg == NULL) { device_printf(sc->dev, "default config file not available, will use the config " "file on the card's flash instead.\n"); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF); } if (strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) { u_int cflen, i, n; const uint32_t *cfdata; uint32_t param, val, addr, off, mw_base, mw_aperture; KASSERT(cfg != NULL || default_cfg != NULL, ("%s: no config to upload", __func__)); /* * Ask the firmware where it wants us to upload the config file. */ param = FW_PARAM_DEV(CF); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* No support for config file? Shouldn't happen. */ device_printf(sc->dev, "failed to query config file location: %d.\n", rc); goto done; } mtype = G_FW_PARAMS_PARAM_Y(val); moff = G_FW_PARAMS_PARAM_Z(val) << 16; /* * XXX: sheer laziness. We deliberately added 4 bytes of * useless stuffing/comments at the end of the config file so * it's ok to simply throw away the last remaining bytes when * the config file is not an exact multiple of 4. This also * helps with the validate_mt_off_len check. */ if (cfg != NULL) { cflen = cfg->datasize & ~3; cfdata = cfg->data; } else { cflen = default_cfg->datasize & ~3; cfdata = default_cfg->data; } if (cflen > FLASH_CFG_MAX_SIZE) { device_printf(sc->dev, "config file too long (%d, max allowed is %d). " "Will try to use the config on the card, if any.\n", cflen, FLASH_CFG_MAX_SIZE); goto use_config_on_flash; } rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr); if (rc != 0) { device_printf(sc->dev, "%s: addr (%d/0x%x) or len %d is not valid: %d. " "Will try to use the config on the card, if any.\n", __func__, mtype, moff, cflen, rc); goto use_config_on_flash; } memwin_info(sc, 2, &mw_base, &mw_aperture); while (cflen) { off = position_memwin(sc, 2, addr); n = min(cflen, mw_aperture - off); for (i = 0; i < n; i += 4) t4_write_reg(sc, mw_base + off + i, *cfdata++); cflen -= n; addr += n; } } else { use_config_on_flash: mtype = FW_MEMTYPE_FLASH; moff = t4_flash_cfg_addr(sc); } bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to pre-process config file: %d " "(mtype %d, moff 0x%x).\n", rc, mtype, moff); goto done; } finicsum = be32toh(caps.finicsum); cfcsum = be32toh(caps.cfcsum); if (finicsum != cfcsum) { device_printf(sc->dev, "WARNING: config file checksum mismatch: %08x %08x\n", finicsum, cfcsum); } sc->cfcsum = cfcsum; #define LIMIT_CAPS(x) do { \ caps.x &= htobe16(t4_##x##_allowed); \ } while (0) /* * Let the firmware know what features will (not) be used so it can tune * things accordingly. */ LIMIT_CAPS(linkcaps); LIMIT_CAPS(niccaps); LIMIT_CAPS(toecaps); LIMIT_CAPS(rdmacaps); LIMIT_CAPS(iscsicaps); LIMIT_CAPS(fcoecaps); #undef LIMIT_CAPS caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL); if (rc != 0) { device_printf(sc->dev, "failed to process config file: %d.\n", rc); } done: if (cfg != NULL) firmware_put(cfg, FIRMWARE_UNLOAD); return (rc); } /* * Retrieve parameters that are needed (or nice to have) very early. */ static int get_params__pre_init(struct adapter *sc) { int rc; uint32_t param[2], val[2]; struct fw_devlog_cmd cmd; struct devlog_params *dlog = &sc->params.devlog; param[0] = FW_PARAM_DEV(PORTVEC); param[1] = FW_PARAM_DEV(CCLK); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (pre_init): %d.\n", rc); return (rc); } sc->params.portvec = val[0]; sc->params.nports = bitcount32(val[0]); sc->params.vpd.cclk = val[1]; /* Read device log parameters. */ bzero(&cmd, sizeof(cmd)); cmd.op_to_write = htobe32(V_FW_CMD_OP(FW_DEVLOG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); cmd.retval_len16 = htobe32(FW_LEN16(cmd)); rc = -t4_wr_mbox(sc, sc->mbox, &cmd, sizeof(cmd), &cmd); if (rc != 0) { device_printf(sc->dev, "failed to get devlog parameters: %d.\n", rc); bzero(dlog, sizeof (*dlog)); rc = 0; /* devlog isn't critical for device operation */ } else { val[0] = be32toh(cmd.memtype_devlog_memaddr16_devlog); dlog->memtype = G_FW_DEVLOG_CMD_MEMTYPE_DEVLOG(val[0]); dlog->start = G_FW_DEVLOG_CMD_MEMADDR16_DEVLOG(val[0]) << 4; dlog->size = be32toh(cmd.memsize_devlog); } return (rc); } /* * Retrieve various parameters that are of interest to the driver. The device * has been initialized by the firmware at this point. */ static int get_params__post_init(struct adapter *sc) { int rc; uint32_t param[7], val[7]; struct fw_caps_config_cmd caps; param[0] = FW_PARAM_PFVF(IQFLINT_START); param[1] = FW_PARAM_PFVF(EQ_START); param[2] = FW_PARAM_PFVF(FILTER_START); param[3] = FW_PARAM_PFVF(FILTER_END); param[4] = FW_PARAM_PFVF(L2T_START); param[5] = FW_PARAM_PFVF(L2T_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init): %d.\n", rc); return (rc); } sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; sc->tids.ftid_base = val[2]; sc->tids.nftids = val[3] - val[2] + 1; sc->params.ftid_min = val[2]; sc->params.ftid_max = val[3]; sc->vres.l2t.start = val[4]; sc->vres.l2t.size = val[5] - val[4] + 1; KASSERT(sc->vres.l2t.size <= L2T_SIZE, ("%s: L2 table size (%u) larger than expected (%u)", __func__, sc->vres.l2t.size, L2T_SIZE)); /* get capabilites */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to get card capabilities: %d.\n", rc); return (rc); } #define READ_CAPS(x) do { \ sc->x = htobe16(caps.x); \ } while (0) READ_CAPS(linkcaps); READ_CAPS(niccaps); READ_CAPS(toecaps); READ_CAPS(rdmacaps); READ_CAPS(iscsicaps); READ_CAPS(fcoecaps); if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) { param[0] = FW_PARAM_PFVF(ETHOFLD_START); param[1] = FW_PARAM_PFVF(ETHOFLD_END); param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query NIC parameters: %d.\n", rc); return (rc); } sc->tids.etid_base = val[0]; sc->params.etid_min = val[0]; sc->tids.netids = val[1] - val[0] + 1; sc->params.netids = sc->tids.netids; sc->params.eo_wr_cred = val[2]; sc->params.ethoffload = 1; } if (sc->toecaps) { /* query offload-related parameters */ param[0] = FW_PARAM_DEV(NTID); param[1] = FW_PARAM_PFVF(SERVER_START); param[2] = FW_PARAM_PFVF(SERVER_END); param[3] = FW_PARAM_PFVF(TDDP_START); param[4] = FW_PARAM_PFVF(TDDP_END); param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TOE parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; sc->vres.ddp.start = val[3]; sc->vres.ddp.size = val[4] - val[3] + 1; sc->params.ofldq_wr_cred = val[5]; sc->params.offload = 1; } if (sc->rdmacaps) { param[0] = FW_PARAM_PFVF(STAG_START); param[1] = FW_PARAM_PFVF(STAG_END); param[2] = FW_PARAM_PFVF(RQ_START); param[3] = FW_PARAM_PFVF(RQ_END); param[4] = FW_PARAM_PFVF(PBL_START); param[5] = FW_PARAM_PFVF(PBL_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(1): %d.\n", rc); return (rc); } sc->vres.stag.start = val[0]; sc->vres.stag.size = val[1] - val[0] + 1; sc->vres.rq.start = val[2]; sc->vres.rq.size = val[3] - val[2] + 1; sc->vres.pbl.start = val[4]; sc->vres.pbl.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SQRQ_START); param[1] = FW_PARAM_PFVF(SQRQ_END); param[2] = FW_PARAM_PFVF(CQ_START); param[3] = FW_PARAM_PFVF(CQ_END); param[4] = FW_PARAM_PFVF(OCQ_START); param[5] = FW_PARAM_PFVF(OCQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(2): %d.\n", rc); return (rc); } sc->vres.qp.start = val[0]; sc->vres.qp.size = val[1] - val[0] + 1; sc->vres.cq.start = val[2]; sc->vres.cq.size = val[3] - val[2] + 1; sc->vres.ocq.start = val[4]; sc->vres.ocq.size = val[5] - val[4] + 1; } if (sc->iscsicaps) { param[0] = FW_PARAM_PFVF(ISCSI_START); param[1] = FW_PARAM_PFVF(ISCSI_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query iSCSI parameters: %d.\n", rc); return (rc); } sc->vres.iscsi.start = val[0]; sc->vres.iscsi.size = val[1] - val[0] + 1; } /* * We've got the params we wanted to query via the firmware. Now grab * some others directly from the chip. */ rc = t4_read_chip_settings(sc); return (rc); } static int set_params__post_init(struct adapter *sc) { uint32_t param, val; /* ask for encapsulated CPLs */ param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); return (0); } #undef FW_PARAM_PFVF #undef FW_PARAM_DEV static void t4_set_desc(struct adapter *sc) { char buf[128]; struct adapter_params *p = &sc->params; snprintf(buf, sizeof(buf), "Chelsio %s %sNIC (rev %d), S/N:%s, " "P/N:%s, E/C:%s", p->vpd.id, is_offload(sc) ? "R" : "", chip_rev(sc), p->vpd.sn, p->vpd.pn, p->vpd.ec); device_set_desc_copy(sc->dev, buf); } static void build_medialist(struct port_info *pi, struct ifmedia *media) { int data, m; PORT_LOCK(pi); ifmedia_removeall(media); m = IFM_ETHER | IFM_FDX; data = (pi->port_type << 8) | pi->mod_type; switch(pi->port_type) { case FW_PORT_TYPE_BT_XFI: ifmedia_add(media, m | IFM_10G_T, data, NULL); break; case FW_PORT_TYPE_BT_XAUI: ifmedia_add(media, m | IFM_10G_T, data, NULL); /* fall through */ case FW_PORT_TYPE_BT_SGMII: ifmedia_add(media, m | IFM_1000_T, data, NULL); ifmedia_add(media, m | IFM_100_TX, data, NULL); ifmedia_add(media, IFM_ETHER | IFM_AUTO, data, NULL); ifmedia_set(media, IFM_ETHER | IFM_AUTO); break; case FW_PORT_TYPE_CX4: ifmedia_add(media, m | IFM_10G_CX4, data, NULL); ifmedia_set(media, m | IFM_10G_CX4); break; case FW_PORT_TYPE_QSFP_10G: case FW_PORT_TYPE_SFP: case FW_PORT_TYPE_FIBER_XFI: case FW_PORT_TYPE_FIBER_XAUI: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_10G_LR, data, NULL); ifmedia_set(media, m | IFM_10G_LR); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | IFM_10G_SR, data, NULL); ifmedia_set(media, m | IFM_10G_SR); break; case FW_PORT_MOD_TYPE_LRM: ifmedia_add(media, m | IFM_10G_LRM, data, NULL); ifmedia_set(media, m | IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_10G_TWINAX, data, NULL); ifmedia_set(media, m | IFM_10G_TWINAX); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, data, NULL); ifmedia_set(media, m | IFM_NONE); break; case FW_PORT_MOD_TYPE_NA: case FW_PORT_MOD_TYPE_ER: default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; case FW_PORT_TYPE_QSFP: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_40G_LR4, data, NULL); ifmedia_set(media, m | IFM_40G_LR4); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | IFM_40G_SR4, data, NULL); ifmedia_set(media, m | IFM_40G_SR4); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_40G_CR4, data, NULL); ifmedia_set(media, m | IFM_40G_CR4); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, data, NULL); ifmedia_set(media, m | IFM_NONE); break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } PORT_UNLOCK(pi); } #define FW_MAC_EXACT_CHUNK 7 /* * Program the port's XGMAC based on parameters in ifnet. The caller also * indicates which parameters should be programmed (the rest are left alone). */ int update_mac_settings(struct ifnet *ifp, int flags) { int rc = 0; struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; uint16_t viid = 0xffff; int16_t *xact_addr_filt = NULL; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); if (ifp == pi->ifp) { viid = pi->viid; xact_addr_filt = &pi->xact_addr_filt; } #ifdef DEV_NETMAP else if (ifp == pi->nm_ifp) { viid = pi->nm_viid; xact_addr_filt = &pi->nm_xact_addr_filt; } #endif if (flags & XGMAC_MTU) mtu = ifp->if_mtu; if (flags & XGMAC_PROMISC) promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0; if (flags & XGMAC_ALLMULTI) allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0; if (flags & XGMAC_VLANEX) vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0; if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) { rc = -t4_set_rxmode(sc, sc->mbox, viid, mtu, promisc, allmulti, 1, vlanex, false); if (rc) { if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, rc); return (rc); } } if (flags & XGMAC_UCADDR) { uint8_t ucaddr[ETHER_ADDR_LEN]; bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr)); rc = t4_change_mac(sc, sc->mbox, viid, *xact_addr_filt, ucaddr, true, true); if (rc < 0) { rc = -rc; if_printf(ifp, "change_mac failed: %d\n", rc); return (rc); } else { *xact_addr_filt = rc; rc = 0; } } if (flags & XGMAC_MCADDRS) { const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK]; int del = 1; uint64_t hash = 0; struct ifmultiaddr *ifma; int i = 0, j; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mcaddr[i] = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); MPASS(ETHER_IS_MULTICAST(mcaddr[i])); i++; if (i == FW_MAC_EXACT_CHUNK) { rc = t4_alloc_mac_filt(sc, sc->mbox, viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } del = 0; i = 0; } } if (i > 0) { rc = t4_alloc_mac_filt(sc, sc->mbox, viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } } rc = -t4_set_addr_hash(sc, sc->mbox, viid, 0, hash, 0); if (rc != 0) if_printf(ifp, "failed to set mc address hash: %d", rc); mcfail: if_maddr_runlock(ifp); } return (rc); } int begin_synchronized_op(struct adapter *sc, struct port_info *pi, int flags, char *wmesg) { int rc, pri; #ifdef WITNESS /* the caller thinks it's ok to sleep, but is it really? */ if (flags & SLEEP_OK) pause("t4slptst", 1); #endif if (INTR_OK) pri = PCATCH; else pri = 0; ADAPTER_LOCK(sc); for (;;) { if (pi && IS_DOOMED(pi)) { rc = ENXIO; goto done; } if (!IS_BUSY(sc)) { rc = 0; break; } if (!(flags & SLEEP_OK)) { rc = EBUSY; goto done; } if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) { rc = EINTR; goto done; } } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = wmesg; sc->last_op_thr = curthread; #endif done: if (!(flags & HOLD_LOCK) || rc) ADAPTER_UNLOCK(sc); return (rc); } void end_synchronized_op(struct adapter *sc, int flags) { if (flags & LOCK_HELD) ADAPTER_LOCK_ASSERT_OWNED(sc); else ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); } static int cxgbe_init_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; int rc = 0; ASSERT_SYNCHRONIZED_OP(sc); if (isset(&sc->open_device_map, pi->port_id)) { KASSERT(ifp->if_drv_flags & IFF_DRV_RUNNING, ("mismatch between open_device_map and if_drv_flags")); return (0); /* already running */ } if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_full_init(sc)) != 0)) return (rc); /* error message displayed already */ if (!(pi->flags & PORT_INIT_DONE) && ((rc = port_full_init(pi)) != 0)) return (rc); /* error message displayed already */ rc = update_mac_settings(ifp, XGMAC_ALL); if (rc) goto done; /* error message displayed already */ rc = -t4_enable_vi(sc, sc->mbox, pi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); goto done; } /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0) { sc->traceq = sc->sge.rxq[pi->first_rxq].iq.abs_id; t4_write_reg(sc, is_t4(sc) ? A_MPS_TRC_RSS_CONTROL : A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) | V_QUEUENUMBER(sc->traceq)); pi->flags |= HAS_TRACEQ; } /* all ok */ setbit(&sc->open_device_map, pi->port_id); PORT_LOCK(pi); ifp->if_drv_flags |= IFF_DRV_RUNNING; PORT_UNLOCK(pi); callout_reset(&pi->tick, hz, cxgbe_tick, pi); done: if (rc != 0) cxgbe_uninit_synchronized(pi); return (rc); } /* * Idempotent. */ static int cxgbe_uninit_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; int rc; ASSERT_SYNCHRONIZED_OP(sc); /* * Disable the VI so that all its data in either direction is discarded * by the MPS. Leave everything else (the queues, interrupts, and 1Hz * tick) intact as the TP can deliver negative advice or data that it's * holding in its RAM (for an offloaded connection) even after the VI is * disabled. */ rc = -t4_enable_vi(sc, sc->mbox, pi->viid, false, false); if (rc) { if_printf(ifp, "disable_vi failed: %d\n", rc); return (rc); } clrbit(&sc->open_device_map, pi->port_id); PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; PORT_UNLOCK(pi); pi->link_cfg.link_ok = 0; pi->link_cfg.speed = 0; pi->linkdnrc = -1; t4_os_link_changed(sc, pi->port_id, 0, -1); return (0); } /* * It is ok for this function to fail midway and return right away. t4_detach * will walk the entire sc->irq list and clean up whatever is valid. */ static int setup_intr_handlers(struct adapter *sc) { int rc, rid, p, q; char s[8]; struct irq *irq; struct port_info *pi; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; #endif /* * Setup interrupts. */ irq = &sc->irq[0]; rid = sc->intr_type == INTR_INTX ? 0 : 1; if (sc->intr_count == 1) return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all")); /* Multiple interrupts. */ KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); /* The first one is always error intr */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); if (rc != 0) return (rc); irq++; rid++; /* The second one is always the firmware event queue */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sc->sge.fwq, "evt"); if (rc != 0) return (rc); irq++; rid++; for_each_port(sc, p) { pi = sc->port[p]; if (pi->flags & INTR_RXQ) { for_each_rxq(pi, q, rxq) { snprintf(s, sizeof(s), "%d.%d", p, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq, s); if (rc != 0) return (rc); irq++; rid++; } } #ifdef TCP_OFFLOAD if (pi->flags & INTR_OFLD_RXQ) { for_each_ofld_rxq(pi, q, ofld_rxq) { snprintf(s, sizeof(s), "%d,%d", p, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, ofld_rxq, s); if (rc != 0) return (rc); irq++; rid++; } } #endif #ifdef DEV_NETMAP if (pi->flags & INTR_NM_RXQ) { for_each_nm_rxq(pi, q, nm_rxq) { snprintf(s, sizeof(s), "%d-%d", p, q); rc = t4_alloc_irq(sc, irq, rid, t4_nm_intr, nm_rxq, s); if (rc != 0) return (rc); irq++; rid++; } } #endif } MPASS(irq == &sc->irq[sc->intr_count]); return (0); } int adapter_full_init(struct adapter *sc) { int rc, i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); KASSERT((sc->flags & FULL_INIT_DONE) == 0, ("%s: FULL_INIT_DONE already", __func__)); /* * queues that belong to the adapter (not any particular port). */ rc = t4_setup_adapter_queues(sc); if (rc != 0) goto done; for (i = 0; i < nitems(sc->tq); i++) { sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT, taskqueue_thread_enqueue, &sc->tq[i]); if (sc->tq[i] == NULL) { device_printf(sc->dev, "failed to allocate task queue %d\n", i); rc = ENOMEM; goto done; } taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d", device_get_nameunit(sc->dev), i); } t4_intr_enable(sc); sc->flags |= FULL_INIT_DONE; done: if (rc != 0) adapter_full_uninit(sc); return (rc); } int adapter_full_uninit(struct adapter *sc) { int i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); t4_teardown_adapter_queues(sc); for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) { taskqueue_free(sc->tq[i]); sc->tq[i] = NULL; } sc->flags &= ~FULL_INIT_DONE; return (0); } int port_full_init(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; uint16_t *rss; struct sge_rxq *rxq; int rc, i, j; ASSERT_SYNCHRONIZED_OP(sc); KASSERT((pi->flags & PORT_INIT_DONE) == 0, ("%s: PORT_INIT_DONE already", __func__)); sysctl_ctx_init(&pi->ctx); pi->flags |= PORT_SYSCTL_CTX; /* * Allocate tx/rx/fl queues for this port. */ rc = t4_setup_port_queues(pi); if (rc != 0) goto done; /* error message displayed already */ /* * Setup RSS for this port. Save a copy of the RSS table for later use. */ rss = malloc(pi->rss_size * sizeof (*rss), M_CXGBE, M_ZERO | M_WAITOK); for (i = 0; i < pi->rss_size;) { for_each_rxq(pi, j, rxq) { rss[i++] = rxq->iq.abs_id; if (i == pi->rss_size) break; } } rc = -t4_config_rss_range(sc, sc->mbox, pi->viid, 0, pi->rss_size, rss, pi->rss_size); if (rc != 0) { if_printf(ifp, "rss_config failed: %d\n", rc); goto done; } pi->rss = rss; pi->flags |= PORT_INIT_DONE; done: if (rc != 0) port_full_uninit(pi); return (rc); } /* * Idempotent. */ int port_full_uninit(struct port_info *pi) { struct adapter *sc = pi->adapter; int i; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif if (pi->flags & PORT_INIT_DONE) { /* Need to quiesce queues. XXX: ctrl queues? */ for_each_txq(pi, i, txq) { quiesce_eq(sc, &txq->eq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { quiesce_eq(sc, &ofld_txq->eq); } #endif for_each_rxq(pi, i, rxq) { quiesce_iq(sc, &rxq->iq); quiesce_fl(sc, &rxq->fl); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { quiesce_iq(sc, &ofld_rxq->iq); quiesce_fl(sc, &ofld_rxq->fl); } #endif free(pi->rss, M_CXGBE); } t4_teardown_port_queues(pi); pi->flags &= ~PORT_INIT_DONE; return (0); } static void quiesce_eq(struct adapter *sc, struct sge_eq *eq) { EQ_LOCK(eq); eq->flags |= EQ_DOOMED; /* * Wait for the response to a credit flush if one's * pending. */ while (eq->flags & EQ_CRFLUSHED) mtx_sleep(eq, &eq->eq_lock, 0, "crflush", 0); EQ_UNLOCK(eq); callout_drain(&eq->tx_callout); /* XXX: iffy */ pause("callout", 10); /* Still iffy */ taskqueue_drain(sc->tq[eq->tx_chan], &eq->tx_task); } static void quiesce_iq(struct adapter *sc, struct sge_iq *iq) { (void) sc; /* unused */ /* Synchronize with the interrupt handler */ while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED)) pause("iqfree", 1); } static void quiesce_fl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); fl->flags |= FL_DOOMED; FL_UNLOCK(fl); mtx_unlock(&sc->sfl_lock); callout_drain(&sc->sfl_callout); KASSERT((fl->flags & FL_STARVING) == 0, ("%s: still starving", __func__)); } static int t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid, driver_intr_t *handler, void *arg, char *name) { int rc; irq->rid = rid; irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid, RF_SHAREABLE | RF_ACTIVE); if (irq->res == NULL) { device_printf(sc->dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET, NULL, handler, arg, &irq->tag); if (rc != 0) { device_printf(sc->dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name, rc); } else if (name) bus_describe_intr(sc->dev, irq->res, irq->tag, name); return (rc); } static int t4_free_irq(struct adapter *sc, struct irq *irq) { if (irq->tag) bus_teardown_intr(sc->dev, irq->res, irq->tag); if (irq->res) bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res); bzero(irq, sizeof(*irq)); return (0); } static void reg_block_dump(struct adapter *sc, uint8_t *buf, unsigned int start, unsigned int end) { uint32_t *p = (uint32_t *)(buf + start); for ( ; start <= end; start += sizeof(uint32_t)) *p++ = t4_read_reg(sc, start); } static void t4_get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf) { int i, n; const unsigned int *reg_ranges; static const unsigned int t4_reg_ranges[] = { 0x1008, 0x1108, 0x1180, 0x11b4, 0x11fc, 0x123c, 0x1300, 0x173c, 0x1800, 0x18fc, 0x3000, 0x30d8, 0x30e0, 0x5924, 0x5960, 0x59d4, 0x5a00, 0x5af8, 0x6000, 0x6098, 0x6100, 0x6150, 0x6200, 0x6208, 0x6240, 0x6248, 0x6280, 0x6338, 0x6370, 0x638c, 0x6400, 0x643c, 0x6500, 0x6524, 0x6a00, 0x6a38, 0x6a60, 0x6a78, 0x6b00, 0x6b84, 0x6bf0, 0x6c84, 0x6cf0, 0x6d84, 0x6df0, 0x6e84, 0x6ef0, 0x6f84, 0x6ff0, 0x7084, 0x70f0, 0x7184, 0x71f0, 0x7284, 0x72f0, 0x7384, 0x73f0, 0x7450, 0x7500, 0x7530, 0x7600, 0x761c, 0x7680, 0x76cc, 0x7700, 0x7798, 0x77c0, 0x77fc, 0x7900, 0x79fc, 0x7b00, 0x7c38, 0x7d00, 0x7efc, 0x8dc0, 0x8e1c, 0x8e30, 0x8e78, 0x8ea0, 0x8f6c, 0x8fc0, 0x9074, 0x90fc, 0x90fc, 0x9400, 0x9458, 0x9600, 0x96bc, 0x9800, 0x9808, 0x9820, 0x983c, 0x9850, 0x9864, 0x9c00, 0x9c6c, 0x9c80, 0x9cec, 0x9d00, 0x9d6c, 0x9d80, 0x9dec, 0x9e00, 0x9e6c, 0x9e80, 0x9eec, 0x9f00, 0x9f6c, 0x9f80, 0x9fec, 0xd004, 0xd03c, 0xdfc0, 0xdfe0, 0xe000, 0xea7c, 0xf000, 0x11110, 0x11118, 0x11190, 0x19040, 0x1906c, 0x19078, 0x19080, 0x1908c, 0x19124, 0x19150, 0x191b0, 0x191d0, 0x191e8, 0x19238, 0x1924c, 0x193f8, 0x19474, 0x19490, 0x194f8, 0x19800, 0x19f30, 0x1a000, 0x1a06c, 0x1a0b0, 0x1a120, 0x1a128, 0x1a138, 0x1a190, 0x1a1c4, 0x1a1fc, 0x1a1fc, 0x1e040, 0x1e04c, 0x1e284, 0x1e28c, 0x1e2c0, 0x1e2c0, 0x1e2e0, 0x1e2e0, 0x1e300, 0x1e384, 0x1e3c0, 0x1e3c8, 0x1e440, 0x1e44c, 0x1e684, 0x1e68c, 0x1e6c0, 0x1e6c0, 0x1e6e0, 0x1e6e0, 0x1e700, 0x1e784, 0x1e7c0, 0x1e7c8, 0x1e840, 0x1e84c, 0x1ea84, 0x1ea8c, 0x1eac0, 0x1eac0, 0x1eae0, 0x1eae0, 0x1eb00, 0x1eb84, 0x1ebc0, 0x1ebc8, 0x1ec40, 0x1ec4c, 0x1ee84, 0x1ee8c, 0x1eec0, 0x1eec0, 0x1eee0, 0x1eee0, 0x1ef00, 0x1ef84, 0x1efc0, 0x1efc8, 0x1f040, 0x1f04c, 0x1f284, 0x1f28c, 0x1f2c0, 0x1f2c0, 0x1f2e0, 0x1f2e0, 0x1f300, 0x1f384, 0x1f3c0, 0x1f3c8, 0x1f440, 0x1f44c, 0x1f684, 0x1f68c, 0x1f6c0, 0x1f6c0, 0x1f6e0, 0x1f6e0, 0x1f700, 0x1f784, 0x1f7c0, 0x1f7c8, 0x1f840, 0x1f84c, 0x1fa84, 0x1fa8c, 0x1fac0, 0x1fac0, 0x1fae0, 0x1fae0, 0x1fb00, 0x1fb84, 0x1fbc0, 0x1fbc8, 0x1fc40, 0x1fc4c, 0x1fe84, 0x1fe8c, 0x1fec0, 0x1fec0, 0x1fee0, 0x1fee0, 0x1ff00, 0x1ff84, 0x1ffc0, 0x1ffc8, 0x20000, 0x2002c, 0x20100, 0x2013c, 0x20190, 0x201c8, 0x20200, 0x20318, 0x20400, 0x20528, 0x20540, 0x20614, 0x21000, 0x21040, 0x2104c, 0x21060, 0x210c0, 0x210ec, 0x21200, 0x21268, 0x21270, 0x21284, 0x212fc, 0x21388, 0x21400, 0x21404, 0x21500, 0x21518, 0x2152c, 0x2153c, 0x21550, 0x21554, 0x21600, 0x21600, 0x21608, 0x21628, 0x21630, 0x2163c, 0x21700, 0x2171c, 0x21780, 0x2178c, 0x21800, 0x21c38, 0x21c80, 0x21d7c, 0x21e00, 0x21e04, 0x22000, 0x2202c, 0x22100, 0x2213c, 0x22190, 0x221c8, 0x22200, 0x22318, 0x22400, 0x22528, 0x22540, 0x22614, 0x23000, 0x23040, 0x2304c, 0x23060, 0x230c0, 0x230ec, 0x23200, 0x23268, 0x23270, 0x23284, 0x232fc, 0x23388, 0x23400, 0x23404, 0x23500, 0x23518, 0x2352c, 0x2353c, 0x23550, 0x23554, 0x23600, 0x23600, 0x23608, 0x23628, 0x23630, 0x2363c, 0x23700, 0x2371c, 0x23780, 0x2378c, 0x23800, 0x23c38, 0x23c80, 0x23d7c, 0x23e00, 0x23e04, 0x24000, 0x2402c, 0x24100, 0x2413c, 0x24190, 0x241c8, 0x24200, 0x24318, 0x24400, 0x24528, 0x24540, 0x24614, 0x25000, 0x25040, 0x2504c, 0x25060, 0x250c0, 0x250ec, 0x25200, 0x25268, 0x25270, 0x25284, 0x252fc, 0x25388, 0x25400, 0x25404, 0x25500, 0x25518, 0x2552c, 0x2553c, 0x25550, 0x25554, 0x25600, 0x25600, 0x25608, 0x25628, 0x25630, 0x2563c, 0x25700, 0x2571c, 0x25780, 0x2578c, 0x25800, 0x25c38, 0x25c80, 0x25d7c, 0x25e00, 0x25e04, 0x26000, 0x2602c, 0x26100, 0x2613c, 0x26190, 0x261c8, 0x26200, 0x26318, 0x26400, 0x26528, 0x26540, 0x26614, 0x27000, 0x27040, 0x2704c, 0x27060, 0x270c0, 0x270ec, 0x27200, 0x27268, 0x27270, 0x27284, 0x272fc, 0x27388, 0x27400, 0x27404, 0x27500, 0x27518, 0x2752c, 0x2753c, 0x27550, 0x27554, 0x27600, 0x27600, 0x27608, 0x27628, 0x27630, 0x2763c, 0x27700, 0x2771c, 0x27780, 0x2778c, 0x27800, 0x27c38, 0x27c80, 0x27d7c, 0x27e00, 0x27e04 }; static const unsigned int t5_reg_ranges[] = { 0x1008, 0x1148, 0x1180, 0x11b4, 0x11fc, 0x123c, 0x1280, 0x173c, 0x1800, 0x18fc, 0x3000, 0x3028, 0x3060, 0x30d8, 0x30e0, 0x30fc, 0x3140, 0x357c, 0x35a8, 0x35cc, 0x35ec, 0x35ec, 0x3600, 0x5624, 0x56cc, 0x575c, 0x580c, 0x5814, 0x5890, 0x58bc, 0x5940, 0x59dc, 0x59fc, 0x5a18, 0x5a60, 0x5a9c, 0x5b94, 0x5bfc, 0x6000, 0x6040, 0x6058, 0x614c, 0x7700, 0x7798, 0x77c0, 0x78fc, 0x7b00, 0x7c54, 0x7d00, 0x7efc, 0x8dc0, 0x8de0, 0x8df8, 0x8e84, 0x8ea0, 0x8f84, 0x8fc0, 0x90f8, 0x9400, 0x9470, 0x9600, 0x96f4, 0x9800, 0x9808, 0x9820, 0x983c, 0x9850, 0x9864, 0x9c00, 0x9c6c, 0x9c80, 0x9cec, 0x9d00, 0x9d6c, 0x9d80, 0x9dec, 0x9e00, 0x9e6c, 0x9e80, 0x9eec, 0x9f00, 0x9f6c, 0x9f80, 0xa020, 0xd004, 0xd03c, 0xdfc0, 0xdfe0, 0xe000, 0x11088, 0x1109c, 0x11110, 0x11118, 0x1117c, 0x11190, 0x11204, 0x19040, 0x1906c, 0x19078, 0x19080, 0x1908c, 0x19124, 0x19150, 0x191b0, 0x191d0, 0x191e8, 0x19238, 0x19290, 0x193f8, 0x19474, 0x19490, 0x194cc, 0x194f0, 0x194f8, 0x19c00, 0x19c60, 0x19c94, 0x19e10, 0x19e50, 0x19f34, 0x19f40, 0x19f50, 0x19f90, 0x19fe4, 0x1a000, 0x1a06c, 0x1a0b0, 0x1a120, 0x1a128, 0x1a138, 0x1a190, 0x1a1c4, 0x1a1fc, 0x1a1fc, 0x1e008, 0x1e00c, 0x1e040, 0x1e04c, 0x1e284, 0x1e290, 0x1e2c0, 0x1e2c0, 0x1e2e0, 0x1e2e0, 0x1e300, 0x1e384, 0x1e3c0, 0x1e3c8, 0x1e408, 0x1e40c, 0x1e440, 0x1e44c, 0x1e684, 0x1e690, 0x1e6c0, 0x1e6c0, 0x1e6e0, 0x1e6e0, 0x1e700, 0x1e784, 0x1e7c0, 0x1e7c8, 0x1e808, 0x1e80c, 0x1e840, 0x1e84c, 0x1ea84, 0x1ea90, 0x1eac0, 0x1eac0, 0x1eae0, 0x1eae0, 0x1eb00, 0x1eb84, 0x1ebc0, 0x1ebc8, 0x1ec08, 0x1ec0c, 0x1ec40, 0x1ec4c, 0x1ee84, 0x1ee90, 0x1eec0, 0x1eec0, 0x1eee0, 0x1eee0, 0x1ef00, 0x1ef84, 0x1efc0, 0x1efc8, 0x1f008, 0x1f00c, 0x1f040, 0x1f04c, 0x1f284, 0x1f290, 0x1f2c0, 0x1f2c0, 0x1f2e0, 0x1f2e0, 0x1f300, 0x1f384, 0x1f3c0, 0x1f3c8, 0x1f408, 0x1f40c, 0x1f440, 0x1f44c, 0x1f684, 0x1f690, 0x1f6c0, 0x1f6c0, 0x1f6e0, 0x1f6e0, 0x1f700, 0x1f784, 0x1f7c0, 0x1f7c8, 0x1f808, 0x1f80c, 0x1f840, 0x1f84c, 0x1fa84, 0x1fa90, 0x1fac0, 0x1fac0, 0x1fae0, 0x1fae0, 0x1fb00, 0x1fb84, 0x1fbc0, 0x1fbc8, 0x1fc08, 0x1fc0c, 0x1fc40, 0x1fc4c, 0x1fe84, 0x1fe90, 0x1fec0, 0x1fec0, 0x1fee0, 0x1fee0, 0x1ff00, 0x1ff84, 0x1ffc0, 0x1ffc8, 0x30000, 0x30030, 0x30100, 0x30144, 0x30190, 0x301d0, 0x30200, 0x30318, 0x30400, 0x3052c, 0x30540, 0x3061c, 0x30800, 0x30834, 0x308c0, 0x30908, 0x30910, 0x309ac, 0x30a00, 0x30a2c, 0x30a44, 0x30a50, 0x30a74, 0x30c24, 0x30d00, 0x30d00, 0x30d08, 0x30d14, 0x30d1c, 0x30d20, 0x30d3c, 0x30d50, 0x31200, 0x3120c, 0x31220, 0x31220, 0x31240, 0x31240, 0x31600, 0x3160c, 0x31a00, 0x31a1c, 0x31e00, 0x31e20, 0x31e38, 0x31e3c, 0x31e80, 0x31e80, 0x31e88, 0x31ea8, 0x31eb0, 0x31eb4, 0x31ec8, 0x31ed4, 0x31fb8, 0x32004, 0x32200, 0x32200, 0x32208, 0x32240, 0x32248, 0x32280, 0x32288, 0x322c0, 0x322c8, 0x322fc, 0x32600, 0x32630, 0x32a00, 0x32abc, 0x32b00, 0x32b70, 0x33000, 0x33048, 0x33060, 0x3309c, 0x330f0, 0x33148, 0x33160, 0x3319c, 0x331f0, 0x332e4, 0x332f8, 0x333e4, 0x333f8, 0x33448, 0x33460, 0x3349c, 0x334f0, 0x33548, 0x33560, 0x3359c, 0x335f0, 0x336e4, 0x336f8, 0x337e4, 0x337f8, 0x337fc, 0x33814, 0x33814, 0x3382c, 0x3382c, 0x33880, 0x3388c, 0x338e8, 0x338ec, 0x33900, 0x33948, 0x33960, 0x3399c, 0x339f0, 0x33ae4, 0x33af8, 0x33b10, 0x33b28, 0x33b28, 0x33b3c, 0x33b50, 0x33bf0, 0x33c10, 0x33c28, 0x33c28, 0x33c3c, 0x33c50, 0x33cf0, 0x33cfc, 0x34000, 0x34030, 0x34100, 0x34144, 0x34190, 0x341d0, 0x34200, 0x34318, 0x34400, 0x3452c, 0x34540, 0x3461c, 0x34800, 0x34834, 0x348c0, 0x34908, 0x34910, 0x349ac, 0x34a00, 0x34a2c, 0x34a44, 0x34a50, 0x34a74, 0x34c24, 0x34d00, 0x34d00, 0x34d08, 0x34d14, 0x34d1c, 0x34d20, 0x34d3c, 0x34d50, 0x35200, 0x3520c, 0x35220, 0x35220, 0x35240, 0x35240, 0x35600, 0x3560c, 0x35a00, 0x35a1c, 0x35e00, 0x35e20, 0x35e38, 0x35e3c, 0x35e80, 0x35e80, 0x35e88, 0x35ea8, 0x35eb0, 0x35eb4, 0x35ec8, 0x35ed4, 0x35fb8, 0x36004, 0x36200, 0x36200, 0x36208, 0x36240, 0x36248, 0x36280, 0x36288, 0x362c0, 0x362c8, 0x362fc, 0x36600, 0x36630, 0x36a00, 0x36abc, 0x36b00, 0x36b70, 0x37000, 0x37048, 0x37060, 0x3709c, 0x370f0, 0x37148, 0x37160, 0x3719c, 0x371f0, 0x372e4, 0x372f8, 0x373e4, 0x373f8, 0x37448, 0x37460, 0x3749c, 0x374f0, 0x37548, 0x37560, 0x3759c, 0x375f0, 0x376e4, 0x376f8, 0x377e4, 0x377f8, 0x377fc, 0x37814, 0x37814, 0x3782c, 0x3782c, 0x37880, 0x3788c, 0x378e8, 0x378ec, 0x37900, 0x37948, 0x37960, 0x3799c, 0x379f0, 0x37ae4, 0x37af8, 0x37b10, 0x37b28, 0x37b28, 0x37b3c, 0x37b50, 0x37bf0, 0x37c10, 0x37c28, 0x37c28, 0x37c3c, 0x37c50, 0x37cf0, 0x37cfc, 0x38000, 0x38030, 0x38100, 0x38144, 0x38190, 0x381d0, 0x38200, 0x38318, 0x38400, 0x3852c, 0x38540, 0x3861c, 0x38800, 0x38834, 0x388c0, 0x38908, 0x38910, 0x389ac, 0x38a00, 0x38a2c, 0x38a44, 0x38a50, 0x38a74, 0x38c24, 0x38d00, 0x38d00, 0x38d08, 0x38d14, 0x38d1c, 0x38d20, 0x38d3c, 0x38d50, 0x39200, 0x3920c, 0x39220, 0x39220, 0x39240, 0x39240, 0x39600, 0x3960c, 0x39a00, 0x39a1c, 0x39e00, 0x39e20, 0x39e38, 0x39e3c, 0x39e80, 0x39e80, 0x39e88, 0x39ea8, 0x39eb0, 0x39eb4, 0x39ec8, 0x39ed4, 0x39fb8, 0x3a004, 0x3a200, 0x3a200, 0x3a208, 0x3a240, 0x3a248, 0x3a280, 0x3a288, 0x3a2c0, 0x3a2c8, 0x3a2fc, 0x3a600, 0x3a630, 0x3aa00, 0x3aabc, 0x3ab00, 0x3ab70, 0x3b000, 0x3b048, 0x3b060, 0x3b09c, 0x3b0f0, 0x3b148, 0x3b160, 0x3b19c, 0x3b1f0, 0x3b2e4, 0x3b2f8, 0x3b3e4, 0x3b3f8, 0x3b448, 0x3b460, 0x3b49c, 0x3b4f0, 0x3b548, 0x3b560, 0x3b59c, 0x3b5f0, 0x3b6e4, 0x3b6f8, 0x3b7e4, 0x3b7f8, 0x3b7fc, 0x3b814, 0x3b814, 0x3b82c, 0x3b82c, 0x3b880, 0x3b88c, 0x3b8e8, 0x3b8ec, 0x3b900, 0x3b948, 0x3b960, 0x3b99c, 0x3b9f0, 0x3bae4, 0x3baf8, 0x3bb10, 0x3bb28, 0x3bb28, 0x3bb3c, 0x3bb50, 0x3bbf0, 0x3bc10, 0x3bc28, 0x3bc28, 0x3bc3c, 0x3bc50, 0x3bcf0, 0x3bcfc, 0x3c000, 0x3c030, 0x3c100, 0x3c144, 0x3c190, 0x3c1d0, 0x3c200, 0x3c318, 0x3c400, 0x3c52c, 0x3c540, 0x3c61c, 0x3c800, 0x3c834, 0x3c8c0, 0x3c908, 0x3c910, 0x3c9ac, 0x3ca00, 0x3ca2c, 0x3ca44, 0x3ca50, 0x3ca74, 0x3cc24, 0x3cd00, 0x3cd00, 0x3cd08, 0x3cd14, 0x3cd1c, 0x3cd20, 0x3cd3c, 0x3cd50, 0x3d200, 0x3d20c, 0x3d220, 0x3d220, 0x3d240, 0x3d240, 0x3d600, 0x3d60c, 0x3da00, 0x3da1c, 0x3de00, 0x3de20, 0x3de38, 0x3de3c, 0x3de80, 0x3de80, 0x3de88, 0x3dea8, 0x3deb0, 0x3deb4, 0x3dec8, 0x3ded4, 0x3dfb8, 0x3e004, 0x3e200, 0x3e200, 0x3e208, 0x3e240, 0x3e248, 0x3e280, 0x3e288, 0x3e2c0, 0x3e2c8, 0x3e2fc, 0x3e600, 0x3e630, 0x3ea00, 0x3eabc, 0x3eb00, 0x3eb70, 0x3f000, 0x3f048, 0x3f060, 0x3f09c, 0x3f0f0, 0x3f148, 0x3f160, 0x3f19c, 0x3f1f0, 0x3f2e4, 0x3f2f8, 0x3f3e4, 0x3f3f8, 0x3f448, 0x3f460, 0x3f49c, 0x3f4f0, 0x3f548, 0x3f560, 0x3f59c, 0x3f5f0, 0x3f6e4, 0x3f6f8, 0x3f7e4, 0x3f7f8, 0x3f7fc, 0x3f814, 0x3f814, 0x3f82c, 0x3f82c, 0x3f880, 0x3f88c, 0x3f8e8, 0x3f8ec, 0x3f900, 0x3f948, 0x3f960, 0x3f99c, 0x3f9f0, 0x3fae4, 0x3faf8, 0x3fb10, 0x3fb28, 0x3fb28, 0x3fb3c, 0x3fb50, 0x3fbf0, 0x3fc10, 0x3fc28, 0x3fc28, 0x3fc3c, 0x3fc50, 0x3fcf0, 0x3fcfc, 0x40000, 0x4000c, 0x40040, 0x40068, 0x4007c, 0x40144, 0x40180, 0x4018c, 0x40200, 0x40298, 0x402ac, 0x4033c, 0x403f8, 0x403fc, 0x41304, 0x413c4, 0x41400, 0x4141c, 0x41480, 0x414d0, 0x44000, 0x44078, 0x440c0, 0x44278, 0x442c0, 0x44478, 0x444c0, 0x44678, 0x446c0, 0x44878, 0x448c0, 0x449fc, 0x45000, 0x45068, 0x45080, 0x45084, 0x450a0, 0x450b0, 0x45200, 0x45268, 0x45280, 0x45284, 0x452a0, 0x452b0, 0x460c0, 0x460e4, 0x47000, 0x4708c, 0x47200, 0x47250, 0x47400, 0x47420, 0x47600, 0x47618, 0x47800, 0x47814, 0x48000, 0x4800c, 0x48040, 0x48068, 0x4807c, 0x48144, 0x48180, 0x4818c, 0x48200, 0x48298, 0x482ac, 0x4833c, 0x483f8, 0x483fc, 0x49304, 0x493c4, 0x49400, 0x4941c, 0x49480, 0x494d0, 0x4c000, 0x4c078, 0x4c0c0, 0x4c278, 0x4c2c0, 0x4c478, 0x4c4c0, 0x4c678, 0x4c6c0, 0x4c878, 0x4c8c0, 0x4c9fc, 0x4d000, 0x4d068, 0x4d080, 0x4d084, 0x4d0a0, 0x4d0b0, 0x4d200, 0x4d268, 0x4d280, 0x4d284, 0x4d2a0, 0x4d2b0, 0x4e0c0, 0x4e0e4, 0x4f000, 0x4f08c, 0x4f200, 0x4f250, 0x4f400, 0x4f420, 0x4f600, 0x4f618, 0x4f800, 0x4f814, 0x50000, 0x500cc, 0x50400, 0x50400, 0x50800, 0x508cc, 0x50c00, 0x50c00, 0x51000, 0x5101c, 0x51300, 0x51308, }; if (is_t4(sc)) { reg_ranges = &t4_reg_ranges[0]; n = nitems(t4_reg_ranges); } else { reg_ranges = &t5_reg_ranges[0]; n = nitems(t5_reg_ranges); } regs->version = chip_id(sc) | chip_rev(sc) << 10; for (i = 0; i < n; i += 2) reg_block_dump(sc, buf, reg_ranges[i], reg_ranges[i + 1]); } static void cxgbe_refresh_stats(struct adapter *sc, struct port_info *pi) { int i; u_int v, tnl_cong_drops; struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &pi->last_refreshed, <)) return; tnl_cong_drops = 0; t4_get_port_stats(sc, pi->tx_chan, &pi->stats); for (i = 0; i < NCHAN; i++) { if (pi->rx_chan_map & (1 << i)) { mtx_lock(&sc->regwin_lock); t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); mtx_unlock(&sc->regwin_lock); tnl_cong_drops += v; } } pi->tnl_cong_drops = tnl_cong_drops; getmicrotime(&pi->last_refreshed); } static void cxgbe_tick(void *arg) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; PORT_LOCK(pi); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { PORT_UNLOCK(pi); return; /* without scheduling another callout */ } cxgbe_refresh_stats(sc, pi); callout_schedule(&pi->tick, hz); PORT_UNLOCK(pi); } static void cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid) { struct ifnet *vlan; if (arg != ifp || ifp->if_type != IFT_ETHER) return; vlan = VLAN_DEVAT(ifp, vid); VLAN_SETCOOKIE(vlan, ifp); } static int cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { #ifdef INVARIANTS panic("%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); #else log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n", __func__, rss->opcode, iq, m); m_freem(m); #endif return (EDOOFUS); } int t4_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h) { uintptr_t *loc, new; if (opcode >= nitems(sc->cpl_handler)) return (EINVAL); new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled; loc = (uintptr_t *) &sc->cpl_handler[opcode]; atomic_store_rel_ptr(loc, new); return (0); } static int an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl) { #ifdef INVARIANTS panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl); #else log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n", __func__, iq, ctrl); #endif return (EDOOFUS); } int t4_register_an_handler(struct adapter *sc, an_handler_t h) { uintptr_t *loc, new; new = h ? (uintptr_t)h : (uintptr_t)an_not_handled; loc = (uintptr_t *) &sc->an_handler; atomic_store_rel_ptr(loc, new); return (0); } static int fw_msg_not_handled(struct adapter *sc, const __be64 *rpl) { const struct cpl_fw6_msg *cpl = __containerof(rpl, struct cpl_fw6_msg, data[0]); #ifdef INVARIANTS panic("%s: fw_msg type %d", __func__, cpl->type); #else log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type); #endif return (EDOOFUS); } int t4_register_fw_msg_handler(struct adapter *sc, int type, fw_msg_handler_t h) { uintptr_t *loc, new; if (type >= nitems(sc->fw_msg_handler)) return (EINVAL); /* * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL * handler dispatch table. Reject any attempt to install a handler for * this subtype. */ if (type == FW_TYPE_RSSCPL || type == FW6_TYPE_RSSCPL) return (EINVAL); new = h ? (uintptr_t)h : (uintptr_t)fw_msg_not_handled; loc = (uintptr_t *) &sc->fw_msg_handler[type]; atomic_store_rel_ptr(loc, new); return (0); } static int t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *c0; static char *caps[] = { "\20\1PPP\2QFC\3DCBX", /* caps[0] linkcaps */ "\20\1NIC\2VM\3IDS\4UM\5UM_ISGL" /* caps[1] niccaps */ "\6HASHFILTER\7ETHOFLD", "\20\1TOE", /* caps[2] toecaps */ "\20\1RDDP\2RDMAC", /* caps[3] rdmacaps */ "\20\1INITIATOR_PDU\2TARGET_PDU" /* caps[4] iscsicaps */ "\3INITIATOR_CNXOFLD\4TARGET_CNXOFLD" "\5INITIATOR_SSNOFLD\6TARGET_SSNOFLD", "\20\1INITIATOR\2TARGET\3CTRL_OFLD" /* caps[5] fcoecaps */ "\4PO_INITIAOR\5PO_TARGET" }; static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"}; ctx = device_get_sysctl_ctx(sc->dev); /* * dev.t4nex.X. */ oid = device_get_sysctl_tree(sc->dev); c0 = children = SYSCTL_CHILDREN(oid); sc->sc_do_rxcopy = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW, &sc->sc_do_rxcopy, 1, "Do RX copy of small frames"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL, sc->params.nports, "# of ports"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, NULL, chip_rev(sc), "chip hardware revision"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf", CTLFLAG_RD, sc->cfg_file, 0, "configuration file"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL, sc->cfcsum, "config file checksum"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells", CTLTYPE_STRING | CTLFLAG_RD, doorbells, sc->doorbells, sysctl_bitfield, "A", "available doorbells"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkcaps", CTLTYPE_STRING | CTLFLAG_RD, caps[0], sc->linkcaps, sysctl_bitfield, "A", "available link capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "niccaps", CTLTYPE_STRING | CTLFLAG_RD, caps[1], sc->niccaps, sysctl_bitfield, "A", "available NIC capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "toecaps", CTLTYPE_STRING | CTLFLAG_RD, caps[2], sc->toecaps, sysctl_bitfield, "A", "available TCP offload capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdmacaps", CTLTYPE_STRING | CTLFLAG_RD, caps[3], sc->rdmacaps, sysctl_bitfield, "A", "available RDMA capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "iscsicaps", CTLTYPE_STRING | CTLFLAG_RD, caps[4], sc->iscsicaps, sysctl_bitfield, "A", "available iSCSI capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoecaps", CTLTYPE_STRING | CTLFLAG_RD, caps[5], sc->fcoecaps, sysctl_bitfield, "A", "available FCoE capabilities"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL, sc->params.vpd.cclk, "core clock frequency (in KHz)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers", CTLTYPE_STRING | CTLFLAG_RD, sc->sge.timer_val, sizeof(sc->sge.timer_val), sysctl_int_array, "A", "interrupt holdoff timer values (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts", CTLTYPE_STRING | CTLFLAG_RD, sc->sge.counter_val, sizeof(sc->sge.counter_val), sysctl_int_array, "A", "interrupt holdoff packet counter values"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD, NULL, sc->tids.nftids, "number of filters"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, sc, 0, sysctl_temperature, "I", "chip temperature (in Celsius)"); t4_sge_sysctls(sc, ctx, children); sc->lro_timeout = 100; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW, &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)"); #ifdef SBUF_DRAIN /* * dev.t4nex.X.misc. Marked CTLFLAG_SKIP to avoid information overload. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc", CTLFLAG_RD | CTLFLAG_SKIP, NULL, "logs and miscellaneous information"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cctrl, "A", "congestion control"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1", CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp", CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0", CTLTYPE_STRING | CTLFLAG_RD, sc, 3, sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1", CTLTYPE_STRING | CTLFLAG_RD, sc, 4, sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi", CTLTYPE_STRING | CTLFLAG_RD, sc, 5, sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_la, "A", "CIM logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_ma_la, "A", "CIM MA logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0", CTLTYPE_STRING | CTLFLAG_RD, sc, 0 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1", CTLTYPE_STRING | CTLFLAG_RD, sc, 1 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2", CTLTYPE_STRING | CTLFLAG_RD, sc, 2 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3", CTLTYPE_STRING | CTLFLAG_RD, sc, 3 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge", CTLTYPE_STRING | CTLFLAG_RD, sc, 4 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi", CTLTYPE_STRING | CTLFLAG_RD, sc, 5 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)"); if (is_t5(sc)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx", CTLTYPE_STRING | CTLFLAG_RD, sc, 6 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx", CTLTYPE_STRING | CTLFLAG_RD, sc, 7 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_pif_la, "A", "CIM PIF logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_qcfg, "A", "CIM queue configuration"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cpl_stats, "A", "CPL statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_ddp_stats, "A", "DDP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_devlog, "A", "firmware's device log"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_fcoe_stats, "A", "FCoE statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_hw_sched, "A", "hardware scheduler "); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_l2t, "A", "hardware L2 table"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_lb_stats, "A", "loopback statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_meminfo, "A", "memory regions"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_mps_tcam, "A", "MPS TCAM entries"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_path_mtus, "A", "path MTUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_pm_stats, "A", "PM statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_rdma_stats, "A", "RDMA statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tcp_stats, "A", "TCP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tids, "A", "TID information"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_err_stats, "A", "TP error statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_la, "A", "TP logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tx_rate, "A", "Tx rate"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_ulprx_la, "A", "ULPRX logic analyzer"); if (is_t5(sc)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_wcwr_stats, "A", "write combined work requests"); } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { /* * dev.t4nex.X.toe. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD, NULL, "TOE parameters"); children = SYSCTL_CHILDREN(oid); sc->tt.sndbuf = 256 * 1024; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW, &sc->tt.sndbuf, 0, "max hardware send buffer size"); sc->tt.ddp = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW, &sc->tt.ddp, 0, "DDP allowed"); sc->tt.indsz = G_INDICATESIZE(t4_read_reg(sc, A_TP_PARA_REG5)); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "indsz", CTLFLAG_RW, &sc->tt.indsz, 0, "DDP max indicate size allowed"); sc->tt.ddp_thres = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp_thres", CTLFLAG_RW, &sc->tt.ddp_thres, 0, "DDP threshold"); sc->tt.rx_coalesce = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce", CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing"); } #endif return (0); } static int cxgbe_sysctls(struct port_info *pi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children; struct adapter *sc = pi->adapter; ctx = device_get_sysctl_ctx(pi->dev); /* * dev.cxgbe.X. */ oid = device_get_sysctl_tree(pi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING | CTLFLAG_RD, pi, 0, sysctl_linkdnrc, "A", "reason why link is down"); if (pi->port_type == FW_PORT_TYPE_BT_XAUI) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, pi, 0, sysctl_btphy, "I", "PHY temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version", CTLTYPE_INT | CTLFLAG_RD, pi, 1, sysctl_btphy, "I", "PHY firmware version"); } SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD, &pi->nrxq, 0, "# of rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD, &pi->ntxq, 0, "# of tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD, &pi->first_rxq, 0, "index of first rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &pi->first_txq, 0, "index of first tx queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_noflowq, "IU", "Reserve queue 0 for non-flowid packets"); #ifdef TCP_OFFLOAD if (is_offload(sc)) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &pi->nofldrxq, 0, "# of rx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD, &pi->nofldtxq, 0, "# of tx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq", CTLFLAG_RD, &pi->first_ofld_rxq, 0, "index of first TOE rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq", CTLFLAG_RD, &pi->first_ofld_txq, 0, "index of first TOE tx queue"); } #endif #ifdef DEV_NETMAP SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD, &pi->nnmrxq, 0, "# of rx queues for netmap"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD, &pi->nnmtxq, 0, "# of tx queues for netmap"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq", CTLFLAG_RD, &pi->first_nm_rxq, 0, "index of first netmap rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq", CTLFLAG_RD, &pi->first_nm_txq, 0, "index of first netmap tx queue"); #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_holdoff_tmr_idx, "I", "holdoff timer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_holdoff_pktc_idx, "I", "holdoff packet counter index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_qsize_rxq, "I", "rx queue size"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_qsize_txq, "I", "tx queue size"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings", CTLTYPE_STRING | CTLFLAG_RW, pi, PAUSE_TX, sysctl_pause_settings, "A", "PAUSE settings (bit 0 = rx_pause, bit 1 = tx_pause)"); /* * dev.cxgbe.X.stats. */ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD, NULL, "port statistics"); children = SYSCTL_CHILDREN(oid); #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \ SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \ CTLTYPE_U64 | CTLFLAG_RD, sc, reg, \ sysctl_handle_t4_reg64, "QU", desc) SYSCTL_ADD_T4_REG64(pi, "tx_octets", "# of octets in good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BYTES_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames", "total # of good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_FRAMES_L)); SYSCTL_ADD_T4_REG64(pi, "tx_bcast_frames", "# of broadcast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_mcast_frames", "# of multicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_MCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ucast_frames", "# of unicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_UCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_error_frames", "# of error frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_64", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_64B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_65_127", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_65B_127B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_128_255", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_128B_255B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_256_511", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_256B_511B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_512_1023", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_512B_1023B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_1024_1518", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1024B_1518B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_1519_max", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1519B_MAX_L)); SYSCTL_ADD_T4_REG64(pi, "tx_drop", "# of dropped tx frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_DROP_L)); SYSCTL_ADD_T4_REG64(pi, "tx_pause", "# of pause frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PAUSE_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp0", "# of PPP prio 0 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP0_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp1", "# of PPP prio 1 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP1_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp2", "# of PPP prio 2 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP2_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp3", "# of PPP prio 3 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP3_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp4", "# of PPP prio 4 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP4_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp5", "# of PPP prio 5 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP5_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp6", "# of PPP prio 6 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP6_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp7", "# of PPP prio 7 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP7_L)); SYSCTL_ADD_T4_REG64(pi, "rx_octets", "# of octets in good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BYTES_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames", "total # of good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_FRAMES_L)); SYSCTL_ADD_T4_REG64(pi, "rx_bcast_frames", "# of broadcast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_mcast_frames", "# of multicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ucast_frames", "# of unicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_UCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_too_long", "# of frames exceeding MTU", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_jabber", "# of jabber frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_fcs_err", "# of frames received with bad FCS", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_len_err", "# of frames received with length error", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LEN_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_symbol_err", "symbol errors", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_SYM_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_runt", "# of short frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LESS_64B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_64", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_64B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_65_127", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_65B_127B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_128_255", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_128B_255B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_256_511", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_256B_511B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_512_1023", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_512B_1023B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_1024_1518", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1024B_1518B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_1519_max", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1519B_MAX_L)); SYSCTL_ADD_T4_REG64(pi, "rx_pause", "# of pause frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PAUSE_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp0", "# of PPP prio 0 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP0_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp1", "# of PPP prio 1 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP1_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp2", "# of PPP prio 2 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP2_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp3", "# of PPP prio 3 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP3_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp4", "# of PPP prio 4 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP4_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp5", "# of PPP prio 5 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP5_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp6", "# of PPP prio 6 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP6_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp7", "# of PPP prio 7 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP7_L)); #undef SYSCTL_ADD_T4_REG64 #define SYSCTL_ADD_T4_PORTSTAT(name, desc) \ SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \ &pi->stats.name, desc) /* We get these from port_stats and they may be stale by upto 1s */ SYSCTL_ADD_T4_PORTSTAT(rx_ovflow0, "# drops due to buffer-group 0 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow1, "# drops due to buffer-group 1 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow2, "# drops due to buffer-group 2 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow3, "# drops due to buffer-group 3 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc0, "# of buffer-group 0 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc1, "# of buffer-group 1 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc2, "# of buffer-group 2 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc3, "# of buffer-group 3 truncated packets"); #undef SYSCTL_ADD_T4_PORTSTAT return (0); } static int sysctl_int_array(SYSCTL_HANDLER_ARGS) { int rc, *i; struct sbuf sb; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (i = arg1; arg2; arg2 -= sizeof(int), i++) sbuf_printf(&sb, "%d ", *i); sbuf_trim(&sb); sbuf_finish(&sb); rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (rc); } static int sysctl_bitfield(SYSCTL_HANDLER_ARGS) { int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", (int)arg2, (char *)arg1); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_btphy(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; int op = arg2; struct adapter *sc = pi->adapter; u_int v; int rc; rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4btt"); if (rc) return (rc); /* XXX: magic numbers */ rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820, &v); end_synchronized_op(sc, 0); if (rc) return (rc); if (op == 0) v /= 256; rc = sysctl_handle_int(oidp, &v, 0, req); return (rc); } static int sysctl_noflowq(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; int rc, val; val = pi->rsrv_noflowq; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if ((val >= 1) && (pi->ntxq > 1)) pi->rsrv_noflowq = 1; else pi->rsrv_noflowq = 0; return (rc); } static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; int idx, rc, i; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif uint8_t v; idx = pi->tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(pi->pktc_idx != -1); for_each_rxq(pi, i, rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&rxq->iq.intr_params, v); #else rxq->iq.intr_params = v; #endif } #ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&ofld_rxq->iq.intr_params, v); #else ofld_rxq->iq.intr_params = v; #endif } #endif pi->tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; int idx, rc; idx = pi->pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4pktc"); if (rc) return (rc); if (pi->flags & PORT_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else pi->pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; int qsize, rc; qsize = pi->qsize_rxq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || (qsize & 7)) return (EINVAL); rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4rxqs"); if (rc) return (rc); if (pi->flags & PORT_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else pi->qsize_rxq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; int qsize, rc; qsize = pi->qsize_txq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); /* bufring size must be powerof2 */ if (qsize < 128 || !powerof2(qsize)) return (EINVAL); rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txqs"); if (rc) return (rc); if (pi->flags & PORT_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else pi->qsize_txq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1PAUSE_RX\2PAUSE_TX"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", lc->fc & (PAUSE_TX | PAUSE_RX), bits); rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[2]; int n; s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX)); s[1] = 0; rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); if (s[1] != 0) return (EINVAL); if (s[0] < '0' || s[0] > '9') return (EINVAL); /* not a number */ n = s[0] - '0'; if (n & ~(PAUSE_TX | PAUSE_RX)) return (EINVAL); /* some other bit is set too */ rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4PAUSE"); if (rc) return (rc); if ((lc->requested_fc & (PAUSE_TX | PAUSE_RX)) != n) { int link_ok = lc->link_ok; lc->requested_fc &= ~(PAUSE_TX | PAUSE_RX); lc->requested_fc |= n; rc = -t4_link_start(sc, sc->mbox, pi->tx_chan, lc); lc->link_ok = link_ok; /* restore */ } end_synchronized_op(sc, 0); } return (rc); } static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int reg = arg2; uint64_t val; val = t4_read_reg64(sc, reg); return (sysctl_handle_64(oidp, &val, 0, req)); } static int sysctl_temperature(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, t; uint32_t param, val; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp"); if (rc) return (rc); param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); end_synchronized_op(sc, 0); if (rc) return (rc); /* unknown is returned as 0 but we display -1 in that case */ t = val == 0 ? -1 : val; rc = sysctl_handle_int(oidp, &t, 0, req); return (rc); } #ifdef SBUF_DRAIN static int sysctl_cctrl(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t incr[NMTUS][NCCTRL_WIN]; static const char *dec_fac[] = { "0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875", "0.9375" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); t4_read_cong_tbl(sc, incr); for (i = 0; i < NCCTRL_WIN; ++i) { sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i, incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i], incr[5][i], incr[6][i], incr[7][i]); sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n", incr[8][i], incr[9][i], incr[10][i], incr[11][i], incr[12][i], incr[13][i], incr[14][i], incr[15][i], sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = { "TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI", /* ibq's */ "ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI", /* obq's */ "SGE0-RX", "SGE1-RX" /* additional obq's (T5 onwards) */ }; static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n, qid = arg2; uint32_t *buf, *p; char *qtype; u_int cim_num_obq = is_t4(sc) ? CIM_NUM_OBQ : CIM_NUM_OBQ_T5; KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq, ("%s: bad qid %d\n", __func__, qid)); if (qid < CIM_NUM_IBQ) { /* inbound queue */ qtype = "IBQ"; n = 4 * CIM_IBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_ibq(sc, qid, buf, n); } else { /* outbound queue */ qtype = "OBQ"; qid -= CIM_NUM_IBQ; n = 4 * cim_num_obq * CIM_OBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_obq(sc, qid, buf, n); } if (rc < 0) { rc = -rc; goto done; } n = rc * sizeof(uint32_t); /* rc has # of words actually read */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]); for (i = 0, p = buf; i < n; i += 16, p += 4) sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1], p[2], p[3]); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int cfg; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_cim_read_la(sc, buf, NULL); if (rc != 0) goto done; sbuf_printf(sb, "Status Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? "" : " LS0Stat LS0Addr LS0Data"); KASSERT((sc->params.cim_la_size & 7) == 0, ("%s: p will walk off the end of buf", __func__)); for (p = buf; p < &buf[sc->params.cim_la_size]; p += 8) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x", p[5] & 0xff, p[6], p[7]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x", (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8, p[4] & 0xff, p[5] >> 8); sbuf_printf(sb, "\n %02x %x%07x %x%07x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4); } else { sbuf_printf(sb, "\n %02x %x%07x %x%07x %08x %08x " "%08x%08x%08x%08x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5], p[6], p[7]); } } rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE); p = buf; for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCnt ID Tag UE Data RDY VLD"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%3u %2u %x %u %08x%08x %u %u", (p[2] >> 10) & 0xff, (p[2] >> 7) & 7, (p[2] >> 3) & 0xf, (p[2] >> 2) & 1, (p[1] >> 2) | ((p[2] & 3) << 30), (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1, p[0] & 1); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL); p = buf; sbuf_printf(sb, "Cntl ID DataBE Addr Data"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %04x %08x %08x%08x%08x%08x", (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff, p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCntl ID Data"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %08x%08x%08x%08x", (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t thres[CIM_NUM_IBQ]; uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr; uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat; u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq; if (is_t4(sc)) { cim_num_obq = CIM_NUM_OBQ; ibq_rdaddr = A_UP_IBQ_0_RDADDR; obq_rdaddr = A_UP_OBQ_0_REALADDR; } else { cim_num_obq = CIM_NUM_OBQ_T5; ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR; obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR; } nq = CIM_NUM_IBQ + cim_num_obq; rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat); if (rc == 0) rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr); if (rc != 0) return (rc); t4_read_cimq_cfg(sc, base, size, thres); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Queue Base Size Thres RdPtr WrPtr SOP EOP Avail"); for (i = 0; i < CIM_NUM_IBQ; i++, p += 4) sbuf_printf(sb, "\n%7s %5x %5u %5u %6x %4x %4u %4u %5u", qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]), G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); for ( ; i < nq; i++, p += 4, wr += 2) sbuf_printf(sb, "\n%7s %5x %5u %12x %4x %4u %4u %5u", qname[i], base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff, wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_cpl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_tp_get_cpl_stats(sc, &stats); sbuf_printf(sb, " channel 0 channel 1 channel 2 " "channel 3\n"); sbuf_printf(sb, "CPL requests: %10u %10u %10u %10u\n", stats.req[0], stats.req[1], stats.req[2], stats.req[3]); sbuf_printf(sb, "CPL responses: %10u %10u %10u %10u", stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_usm_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_usm_stats(sc, &stats); sbuf_printf(sb, "Frames: %u\n", stats.frames); sbuf_printf(sb, "Octets: %ju\n", stats.octets); sbuf_printf(sb, "Drops: %u", stats.drops); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } const char *devlog_level_strings[] = { [FW_DEVLOG_LEVEL_EMERG] = "EMERG", [FW_DEVLOG_LEVEL_CRIT] = "CRIT", [FW_DEVLOG_LEVEL_ERR] = "ERR", [FW_DEVLOG_LEVEL_NOTICE] = "NOTICE", [FW_DEVLOG_LEVEL_INFO] = "INFO", [FW_DEVLOG_LEVEL_DEBUG] = "DEBUG" }; const char *devlog_facility_strings[] = { [FW_DEVLOG_FACILITY_CORE] = "CORE", [FW_DEVLOG_FACILITY_CF] = "CF", [FW_DEVLOG_FACILITY_SCHED] = "SCHED", [FW_DEVLOG_FACILITY_TIMER] = "TIMER", [FW_DEVLOG_FACILITY_RES] = "RES", [FW_DEVLOG_FACILITY_HW] = "HW", [FW_DEVLOG_FACILITY_FLR] = "FLR", [FW_DEVLOG_FACILITY_DMAQ] = "DMAQ", [FW_DEVLOG_FACILITY_PHY] = "PHY", [FW_DEVLOG_FACILITY_MAC] = "MAC", [FW_DEVLOG_FACILITY_PORT] = "PORT", [FW_DEVLOG_FACILITY_VI] = "VI", [FW_DEVLOG_FACILITY_FILTER] = "FILTER", [FW_DEVLOG_FACILITY_ACL] = "ACL", [FW_DEVLOG_FACILITY_TM] = "TM", [FW_DEVLOG_FACILITY_QFC] = "QFC", [FW_DEVLOG_FACILITY_DCB] = "DCB", [FW_DEVLOG_FACILITY_ETH] = "ETH", [FW_DEVLOG_FACILITY_OFLD] = "OFLD", [FW_DEVLOG_FACILITY_RI] = "RI", [FW_DEVLOG_FACILITY_ISCSI] = "ISCSI", [FW_DEVLOG_FACILITY_FCOE] = "FCOE", [FW_DEVLOG_FACILITY_FOISCSI] = "FOISCSI", [FW_DEVLOG_FACILITY_FOFCOE] = "FOFCOE" }; static int sysctl_devlog(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e *buf, *e; int i, j, rc, nentries, first = 0, m; struct sbuf *sb; uint64_t ftstamp = UINT64_MAX; if (dparams->start == 0) { dparams->memtype = FW_MEMTYPE_EDC0; dparams->start = 0x84000; dparams->size = 32768; } nentries = dparams->size / sizeof(struct fw_devlog_e); buf = malloc(dparams->size, M_CXGBE, M_NOWAIT); if (buf == NULL) return (ENOMEM); m = fwmtype_to_hwmtype(dparams->memtype); rc = -t4_mem_read(sc, m, dparams->start, dparams->size, (void *)buf); if (rc != 0) goto done; for (i = 0; i < nentries; i++) { e = &buf[i]; if (e->timestamp == 0) break; /* end */ e->timestamp = be64toh(e->timestamp); e->seqno = be32toh(e->seqno); for (j = 0; j < 8; j++) e->params[j] = be32toh(e->params[j]); if (e->timestamp < ftstamp) { ftstamp = e->timestamp; first = i; } } if (buf[first].timestamp == 0) goto done; /* nothing in the log */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%10s %15s %8s %8s %s\n", "Seq#", "Tstamp", "Level", "Facility", "Message"); i = first; do { e = &buf[i]; if (e->timestamp == 0) break; /* end */ sbuf_printf(sb, "%10d %15ju %8s %8s ", e->seqno, e->timestamp, (e->level < nitems(devlog_level_strings) ? devlog_level_strings[e->level] : "UNKNOWN"), (e->facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e->facility] : "UNKNOWN")); sbuf_printf(sb, e->fmt, e->params[0], e->params[1], e->params[2], e->params[3], e->params[4], e->params[5], e->params[6], e->params[7]); if (++i == nentries) i = 0; } while (i != first); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_fcoe_stats stats[4]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_fcoe_stats(sc, 0, &stats[0]); t4_get_fcoe_stats(sc, 1, &stats[1]); t4_get_fcoe_stats(sc, 2, &stats[2]); t4_get_fcoe_stats(sc, 3, &stats[3]); sbuf_printf(sb, " channel 0 channel 1 " "channel 2 channel 3\n"); sbuf_printf(sb, "octetsDDP: %16ju %16ju %16ju %16ju\n", stats[0].octetsDDP, stats[1].octetsDDP, stats[2].octetsDDP, stats[3].octetsDDP); sbuf_printf(sb, "framesDDP: %16u %16u %16u %16u\n", stats[0].framesDDP, stats[1].framesDDP, stats[2].framesDDP, stats[3].framesDDP); sbuf_printf(sb, "framesDrop: %16u %16u %16u %16u", stats[0].framesDrop, stats[1].framesDrop, stats[2].framesDrop, stats[3].framesDrop); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; unsigned int map, kbps, ipg, mode; unsigned int pace_tab[NTX_SCHED]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP); mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG)); t4_read_pace_tbl(sc, pace_tab); sbuf_printf(sb, "Scheduler Mode Channel Rate (Kbps) " "Class IPG (0.1 ns) Flow IPG (us)"); for (i = 0; i < NTX_SCHED; ++i, map >>= 2) { t4_get_tx_sched(sc, i, &kbps, &ipg); sbuf_printf(sb, "\n %u %-5s %u ", i, (mode & (1 << i)) ? "flow" : "class", map & 3); if (kbps) sbuf_printf(sb, "%9u ", kbps); else sbuf_printf(sb, " disabled "); if (ipg) sbuf_printf(sb, "%13u ", ipg); else sbuf_printf(sb, " disabled "); if (pace_tab[i]) sbuf_printf(sb, "%10u", pace_tab[i]); else sbuf_printf(sb, " disabled"); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, j; uint64_t *p0, *p1; struct lb_port_stats s[2]; static const char *stat_name[] = { "OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:", "UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:", "Frames128To255:", "Frames256To511:", "Frames512To1023:", "Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:", "BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:", "BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:", "BG2FramesTrunc:", "BG3FramesTrunc:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); memset(s, 0, sizeof(s)); for (i = 0; i < 4; i += 2) { t4_get_lb_stats(sc, i, &s[0]); t4_get_lb_stats(sc, i + 1, &s[1]); p0 = &s[0].octets; p1 = &s[1].octets; sbuf_printf(sb, "%s Loopback %u" " Loopback %u", i == 0 ? "" : "\n", i, i + 1); for (j = 0; j < nitems(stat_name); j++) sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j], *p0++, *p1++); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS) { int rc = 0; struct port_info *pi = arg1; struct sbuf *sb; static const char *linkdnreasons[] = { "non-specific", "remote fault", "autoneg failed", "reserved3", "PHY overheated", "unknown", "rx los", "reserved7" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 64, req); if (sb == NULL) return (ENOMEM); if (pi->linkdnrc < 0) sbuf_printf(sb, "n/a"); else if (pi->linkdnrc < nitems(linkdnreasons)) sbuf_printf(sb, "%s", linkdnreasons[pi->linkdnrc]); else sbuf_printf(sb, "%d", pi->linkdnrc); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct mem_desc { unsigned int base; unsigned int limit; unsigned int idx; }; static int mem_desc_cmp(const void *a, const void *b) { return ((const struct mem_desc *)a)->base - ((const struct mem_desc *)b)->base; } static void mem_region_show(struct sbuf *sb, const char *name, unsigned int from, unsigned int to) { unsigned int size; size = to - from + 1; if (size == 0) return; /* XXX: need humanize_number(3) in libkern for a more readable 'size' */ sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size); } static int sysctl_meminfo(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; uint32_t lo, hi, used, alloc; static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"}; static const char *region[] = { "DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:", "Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:", "Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:", "TDDP region:", "TPT region:", "STAG region:", "RQ region:", "RQUDP region:", "PBL region:", "TXPBL region:", "DBVFIFO region:", "ULPRX state:", "ULPTX state:", "On-chip queues:" }; struct mem_desc avail[4]; struct mem_desc mem[nitems(region) + 3]; /* up to 3 holes */ struct mem_desc *md = mem; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nitems(mem); i++) { mem[i].limit = 0; mem[i].idx = i; } /* Find and sort the populated memory ranges */ i = 0; lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (lo & F_EDRAM0_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM0_BAR); avail[i].base = G_EDRAM0_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20); avail[i].idx = 0; i++; } if (lo & F_EDRAM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM1_BAR); avail[i].base = G_EDRAM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20); avail[i].idx = 1; i++; } if (lo & F_EXT_MEM_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); avail[i].base = G_EXT_MEM_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM_SIZE(hi) << 20); avail[i].idx = is_t4(sc) ? 2 : 3; /* Call it MC for T4 */ i++; } if (!is_t4(sc) && lo & F_EXT_MEM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); avail[i].base = G_EXT_MEM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20); avail[i].idx = 4; i++; } if (!i) /* no memory available */ return 0; qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp); (md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR); (md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE); /* the next few have explicit upper bounds */ md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) * G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE)); md++; md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) * G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE)); md++; if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { hi = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4; md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE); md->limit = (sc->tids.ntids - hi) * 16 + md->base - 1; } else { md->base = 0; md->idx = nitems(region); /* hide it */ } md++; #define ulp_region(reg) \ md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\ (md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT) ulp_region(RX_ISCSI); ulp_region(RX_TDDP); ulp_region(TX_TPT); ulp_region(RX_STAG); ulp_region(RX_RQ); ulp_region(RX_RQUDP); ulp_region(RX_PBL); ulp_region(TX_PBL); #undef ulp_region md->base = 0; md->idx = nitems(region); if (!is_t4(sc) && t4_read_reg(sc, A_SGE_CONTROL2) & F_VFIFO_ENABLE) { md->base = G_BASEADDR(t4_read_reg(sc, A_SGE_DBVFIFO_BADDR)); md->limit = md->base + (G_DBVFIFO_SIZE((t4_read_reg(sc, A_SGE_DBVFIFO_SIZE))) << 2) - 1; } md++; md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE); md->limit = md->base + sc->tids.ntids - 1; md++; md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE); md->limit = md->base + sc->tids.ntids - 1; md++; md->base = sc->vres.ocq.start; if (sc->vres.ocq.size) md->limit = md->base + sc->vres.ocq.size - 1; else md->idx = nitems(region); /* hide it */ md++; /* add any address-space holes, there can be up to 3 */ for (n = 0; n < i - 1; n++) if (avail[n].limit < avail[n + 1].base) (md++)->base = avail[n].limit; if (avail[n].limit) (md++)->base = avail[n].limit; n = md - mem; qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp); for (lo = 0; lo < i; lo++) mem_region_show(sb, memory[avail[lo].idx], avail[lo].base, avail[lo].limit - 1); sbuf_printf(sb, "\n"); for (i = 0; i < n; i++) { if (mem[i].idx >= nitems(region)) continue; /* skip holes */ if (!mem[i].limit) mem[i].limit = i < n - 1 ? mem[i + 1].base - 1 : ~0; mem_region_show(sb, region[mem[i].idx], mem[i].base, mem[i].limit); } sbuf_printf(sb, "\n"); lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP RAM:", lo, hi); lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP Extmem2:", lo, hi); lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE); sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n", G_PMRXMAXPAGE(lo), t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10, (lo & F_PMRXNUMCHN) ? 2 : 1); lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE); hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n", G_PMTXMAXPAGE(lo), hi >= (1 << 20) ? (hi >> 20) : (hi >> 10), hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo)); sbuf_printf(sb, "%u p-structs\n", t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT)); for (i = 0; i < 4; i++) { lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4); if (is_t4(sc)) { used = G_USED(lo); alloc = G_ALLOC(lo); } else { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated", i, used, alloc); } for (i = 0; i < 4; i++) { lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4); if (is_t4(sc)) { used = G_USED(lo); alloc = G_ALLOC(lo); } else { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } sbuf_printf(sb, "\nLoopback %d using %u pages out of %u allocated", i, used, alloc); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static inline void tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask) { *mask = x | y; y = htobe64(y); memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN); } static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask Vld Ports PF" " VF Replication P0 P1 P2 P3 ML"); n = is_t4(sc) ? NUM_MPS_CLS_SRAM_L_INSTANCES : NUM_MPS_T5_CLS_SRAM_L_INSTANCES; for (i = 0; i < n; i++) { uint64_t tcamx, tcamy, mask; uint32_t cls_lo, cls_hi; uint8_t addr[ETHER_ADDR_LEN]; tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i)); tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i)); cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx" " %c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, (cls_lo & F_SRAM_VLD) ? 'Y' : 'N', G_PORTMAP(cls_hi), G_PF(cls_lo), (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1); if (cls_lo & F_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.fid_ctl = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_CTL(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mps"); if (rc) break; rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) { sbuf_printf(sb, " ------------ error %3u ------------", rc); rc = 0; } else { sbuf_printf(sb, " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc127_96), be32toh(ldst_cmd.u.mps.rplc95_64), be32toh(ldst_cmd.u.mps.rplc63_32), be32toh(ldst_cmd.u.mps.rplc31_0)); } } else sbuf_printf(sb, "%36s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo), G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo), G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint16_t mtus[NMTUS]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_read_mtu_tbl(sc, mtus, NULL); sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u", mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6], mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13], mtus[14], mtus[15]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint32_t cnt[PM_NSTATS]; uint64_t cyc[PM_NSTATS]; static const char *rx_stats[] = { "Read:", "Write bypass:", "Write mem:", "Flush:" }; static const char *tx_stats[] = { "Read:", "Write bypass:", "Write mem:", "Bypass + mem:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_pmtx_get_stats(sc, cnt, cyc); sbuf_printf(sb, " Tx pcmds Tx bytes"); for (i = 0; i < ARRAY_SIZE(tx_stats); i++) sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], cnt[i], cyc[i]); t4_pmrx_get_stats(sc, cnt, cyc); sbuf_printf(sb, "\n Rx pcmds Rx bytes"); for (i = 0; i < ARRAY_SIZE(rx_stats); i++) sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], cnt[i], cyc[i]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_rdma_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_tp_get_rdma_stats(sc, &stats); sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod); sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tcp_stats v4, v6; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_tp_get_tcp_stats(sc, &v4, &v6); sbuf_printf(sb, " IP IPv6\n"); sbuf_printf(sb, "OutRsts: %20u %20u\n", v4.tcpOutRsts, v6.tcpOutRsts); sbuf_printf(sb, "InSegs: %20ju %20ju\n", v4.tcpInSegs, v6.tcpInSegs); sbuf_printf(sb, "OutSegs: %20ju %20ju\n", v4.tcpOutSegs, v6.tcpOutSegs); sbuf_printf(sb, "RetransSegs: %20ju %20ju", v4.tcpRetransSegs, v6.tcpRetransSegs); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tids(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tid_info *t = &sc->tids; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (t->natids) { sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1, t->atids_in_use); } if (t->ntids) { if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { uint32_t b = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4; if (b) { sbuf_printf(sb, "TID range: 0-%u, %u-%u", b - 1, t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4, t->ntids - 1); } else { sbuf_printf(sb, "TID range: %u-%u", t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4, t->ntids - 1); } } else sbuf_printf(sb, "TID range: 0-%u", t->ntids - 1); sbuf_printf(sb, ", in use: %u\n", atomic_load_acq_int(&t->tids_in_use)); } if (t->nstids) { sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base, t->stid_base + t->nstids - 1, t->stids_in_use); } if (t->nftids) { sbuf_printf(sb, "FTID range: %u-%u\n", t->ftid_base, t->ftid_base + t->nftids - 1); } if (t->netids) { sbuf_printf(sb, "ETID range: %u-%u\n", t->etid_base, t->etid_base + t->netids - 1); } sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users", t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4), t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_err_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_tp_get_err_stats(sc, &stats); sbuf_printf(sb, " channel 0 channel 1 channel 2 " "channel 3\n"); sbuf_printf(sb, "macInErrs: %10u %10u %10u %10u\n", stats.macInErrs[0], stats.macInErrs[1], stats.macInErrs[2], stats.macInErrs[3]); sbuf_printf(sb, "hdrInErrs: %10u %10u %10u %10u\n", stats.hdrInErrs[0], stats.hdrInErrs[1], stats.hdrInErrs[2], stats.hdrInErrs[3]); sbuf_printf(sb, "tcpInErrs: %10u %10u %10u %10u\n", stats.tcpInErrs[0], stats.tcpInErrs[1], stats.tcpInErrs[2], stats.tcpInErrs[3]); sbuf_printf(sb, "tcp6InErrs: %10u %10u %10u %10u\n", stats.tcp6InErrs[0], stats.tcp6InErrs[1], stats.tcp6InErrs[2], stats.tcp6InErrs[3]); sbuf_printf(sb, "tnlCongDrops: %10u %10u %10u %10u\n", stats.tnlCongDrops[0], stats.tnlCongDrops[1], stats.tnlCongDrops[2], stats.tnlCongDrops[3]); sbuf_printf(sb, "tnlTxDrops: %10u %10u %10u %10u\n", stats.tnlTxDrops[0], stats.tnlTxDrops[1], stats.tnlTxDrops[2], stats.tnlTxDrops[3]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u %10u %10u\n", stats.ofldVlanDrops[0], stats.ofldVlanDrops[1], stats.ofldVlanDrops[2], stats.ofldVlanDrops[3]); sbuf_printf(sb, "ofldChanDrops: %10u %10u %10u %10u\n\n", stats.ofldChanDrops[0], stats.ofldChanDrops[1], stats.ofldChanDrops[2], stats.ofldChanDrops[3]); sbuf_printf(sb, "ofldNoNeigh: %u\nofldCongDefer: %u", stats.ofldNoNeigh, stats.ofldCongDefer); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct field_desc { const char *name; u_int start; u_int width; }; static void field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f) { char buf[32]; int line_size = 0; while (f->name) { uint64_t mask = (1ULL << f->width) - 1; int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name, ((uintmax_t)v >> f->start) & mask); if (line_size + len >= 79) { line_size = 8; sbuf_printf(sb, "\n "); } sbuf_printf(sb, "%s ", buf); line_size += len + 1; f++; } sbuf_printf(sb, "\n"); } static struct field_desc tp_la0[] = { { "RcfOpCodeOut", 60, 4 }, { "State", 56, 4 }, { "WcfState", 52, 4 }, { "RcfOpcSrcOut", 50, 2 }, { "CRxError", 49, 1 }, { "ERxError", 48, 1 }, { "SanityFailed", 47, 1 }, { "SpuriousMsg", 46, 1 }, { "FlushInputMsg", 45, 1 }, { "FlushInputCpl", 44, 1 }, { "RssUpBit", 43, 1 }, { "RssFilterHit", 42, 1 }, { "Tid", 32, 10 }, { "InitTcb", 31, 1 }, { "LineNumber", 24, 7 }, { "Emsg", 23, 1 }, { "EdataOut", 22, 1 }, { "Cmsg", 21, 1 }, { "CdataOut", 20, 1 }, { "EreadPdu", 19, 1 }, { "CreadPdu", 18, 1 }, { "TunnelPkt", 17, 1 }, { "RcfPeerFin", 16, 1 }, { "RcfReasonOut", 12, 4 }, { "TxCchannel", 10, 2 }, { "RcfTxChannel", 8, 2 }, { "RxEchannel", 6, 2 }, { "RcfRxChannel", 5, 1 }, { "RcfDataOutSrdy", 4, 1 }, { "RxDvld", 3, 1 }, { "RxOoDvld", 2, 1 }, { "RxCongestion", 1, 1 }, { "TxCongestion", 0, 1 }, { NULL } }; static struct field_desc tp_la1[] = { { "CplCmdIn", 56, 8 }, { "CplCmdOut", 48, 8 }, { "ESynOut", 47, 1 }, { "EAckOut", 46, 1 }, { "EFinOut", 45, 1 }, { "ERstOut", 44, 1 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static struct field_desc tp_la2[] = { { "CplCmdIn", 56, 8 }, { "MpsVfVld", 55, 1 }, { "MpsPf", 52, 3 }, { "MpsVf", 44, 8 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static void tp_la_show(struct sbuf *sb, uint64_t *p, int idx) { field_desc_show(sb, *p, tp_la0); } static void tp_la_show2(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], tp_la0); } static void tp_la_show3(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? tp_la2 : tp_la1); } static int sysctl_tp_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint64_t *buf, *p; int rc; u_int i, inc; void (*show_func)(struct sbuf *, uint64_t *, int); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK); t4_tp_read_la(sc, buf, NULL); p = buf; switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) { case 2: inc = 2; show_func = tp_la_show2; break; case 3: inc = 2; show_func = tp_la_show3; break; default: inc = 1; show_func = tp_la_show; } for (i = 0; i < TPLA_SIZE / inc; i++, p += inc) (*show_func)(sb, p, i); rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; u64 nrate[NCHAN], orate[NCHAN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_chan_txrate(sc, nrate, orate); sbuf_printf(sb, " channel 0 channel 1 channel 2 " "channel 3\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju %10ju %10ju\n", nrate[0], nrate[1], nrate[2], nrate[3]); sbuf_printf(sb, "Offload B/s: %10ju %10ju %10ju %10ju", orate[0], orate[1], orate[2], orate[3]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint32_t *buf, *p; int rc, i; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_ulprx_read_la(sc, buf); p = buf; sbuf_printf(sb, " Pcmd Type Message" " Data"); for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) { sbuf_printf(sb, "\n%08x%08x %4x %08x %08x%08x%08x%08x", p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, v; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); v = t4_read_reg(sc, A_SGE_STAT_CFG); if (G_STATSOURCE_T5(v) == 7) { if (G_STATMODE(v) == 0) { sbuf_printf(sb, "total %d, incomplete %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } else if (G_STATMODE(v) == 1) { sbuf_printf(sb, "total %d, data overflow %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } #endif static inline void txq_start(struct ifnet *ifp, struct sge_txq *txq) { struct buf_ring *br; struct mbuf *m; TXQ_LOCK_ASSERT_OWNED(txq); br = txq->br; m = txq->m ? txq->m : drbr_dequeue(ifp, br); if (m) t4_eth_tx(ifp, txq, m); } void t4_tx_callout(void *arg) { struct sge_eq *eq = arg; struct adapter *sc; if (EQ_TRYLOCK(eq) == 0) goto reschedule; if (eq->flags & EQ_STALLED && !can_resume_tx(eq)) { EQ_UNLOCK(eq); reschedule: if (__predict_true(!(eq->flags && EQ_DOOMED))) callout_schedule(&eq->tx_callout, 1); return; } EQ_LOCK_ASSERT_OWNED(eq); if (__predict_true((eq->flags & EQ_DOOMED) == 0)) { if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) { struct sge_txq *txq = arg; struct port_info *pi = txq->ifp->if_softc; sc = pi->adapter; } else { struct sge_wrq *wrq = arg; sc = wrq->adapter; } taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task); } EQ_UNLOCK(eq); } void t4_tx_task(void *arg, int count) { struct sge_eq *eq = arg; EQ_LOCK(eq); if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) { struct sge_txq *txq = arg; txq_start(txq->ifp, txq); } else { struct sge_wrq *wrq = arg; t4_wrq_tx_locked(wrq->adapter, wrq, NULL); } EQ_UNLOCK(eq); } static uint32_t fconf_to_mode(uint32_t fconf) { uint32_t mode; mode = T4_FILTER_IPv4 | T4_FILTER_IPv6 | T4_FILTER_IP_SADDR | T4_FILTER_IP_DADDR | T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT; if (fconf & F_FRAGMENTATION) mode |= T4_FILTER_IP_FRAGMENT; if (fconf & F_MPSHITTYPE) mode |= T4_FILTER_MPS_HIT_TYPE; if (fconf & F_MACMATCH) mode |= T4_FILTER_MAC_IDX; if (fconf & F_ETHERTYPE) mode |= T4_FILTER_ETH_TYPE; if (fconf & F_PROTOCOL) mode |= T4_FILTER_IP_PROTO; if (fconf & F_TOS) mode |= T4_FILTER_IP_TOS; if (fconf & F_VLAN) mode |= T4_FILTER_VLAN; if (fconf & F_VNIC_ID) mode |= T4_FILTER_VNIC; if (fconf & F_PORT) mode |= T4_FILTER_PORT; if (fconf & F_FCOE) mode |= T4_FILTER_FCoE; return (mode); } static uint32_t mode_to_fconf(uint32_t mode) { uint32_t fconf = 0; if (mode & T4_FILTER_IP_FRAGMENT) fconf |= F_FRAGMENTATION; if (mode & T4_FILTER_MPS_HIT_TYPE) fconf |= F_MPSHITTYPE; if (mode & T4_FILTER_MAC_IDX) fconf |= F_MACMATCH; if (mode & T4_FILTER_ETH_TYPE) fconf |= F_ETHERTYPE; if (mode & T4_FILTER_IP_PROTO) fconf |= F_PROTOCOL; if (mode & T4_FILTER_IP_TOS) fconf |= F_TOS; if (mode & T4_FILTER_VLAN) fconf |= F_VLAN; if (mode & T4_FILTER_VNIC) fconf |= F_VNIC_ID; if (mode & T4_FILTER_PORT) fconf |= F_PORT; if (mode & T4_FILTER_FCoE) fconf |= F_FCOE; return (fconf); } static uint32_t fspec_to_fconf(struct t4_filter_specification *fs) { uint32_t fconf = 0; if (fs->val.frag || fs->mask.frag) fconf |= F_FRAGMENTATION; if (fs->val.matchtype || fs->mask.matchtype) fconf |= F_MPSHITTYPE; if (fs->val.macidx || fs->mask.macidx) fconf |= F_MACMATCH; if (fs->val.ethtype || fs->mask.ethtype) fconf |= F_ETHERTYPE; if (fs->val.proto || fs->mask.proto) fconf |= F_PROTOCOL; if (fs->val.tos || fs->mask.tos) fconf |= F_TOS; if (fs->val.vlan_vld || fs->mask.vlan_vld) fconf |= F_VLAN; if (fs->val.vnic_vld || fs->mask.vnic_vld) fconf |= F_VNIC_ID; if (fs->val.iport || fs->mask.iport) fconf |= F_PORT; if (fs->val.fcoe || fs->mask.fcoe) fconf |= F_FCOE; return (fconf); } static int get_filter_mode(struct adapter *sc, uint32_t *mode) { int rc; uint32_t fconf; rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4getfm"); if (rc) return (rc); t4_read_indirect(sc, A_TP_PIO_ADDR, A_TP_PIO_DATA, &fconf, 1, A_TP_VLAN_PRI_MAP); if (sc->params.tp.vlan_pri_map != fconf) { log(LOG_WARNING, "%s: cached filter mode out of sync %x %x.\n", device_get_nameunit(sc->dev), sc->params.tp.vlan_pri_map, fconf); sc->params.tp.vlan_pri_map = fconf; } *mode = fconf_to_mode(sc->params.tp.vlan_pri_map); end_synchronized_op(sc, LOCK_HELD); return (0); } static int set_filter_mode(struct adapter *sc, uint32_t mode) { uint32_t fconf; int rc; fconf = mode_to_fconf(mode); rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4setfm"); if (rc) return (rc); if (sc->tids.ftids_in_use > 0) { rc = EBUSY; goto done; } #ifdef TCP_OFFLOAD if (sc->offload_map) { rc = EBUSY; goto done; } #endif #ifdef notyet rc = -t4_set_filter_mode(sc, fconf); if (rc == 0) sc->filter_mode = fconf; #else rc = ENOTSUP; #endif done: end_synchronized_op(sc, LOCK_HELD); return (rc); } static inline uint64_t get_filter_hits(struct adapter *sc, uint32_t fid) { uint32_t mw_base, off, tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); uint64_t hits; memwin_info(sc, 0, &mw_base, NULL); off = position_memwin(sc, 0, tcb_base + (fid + sc->tids.ftid_base) * TCB_SIZE); if (is_t4(sc)) { hits = t4_read_reg64(sc, mw_base + off + 16); hits = be64toh(hits); } else { hits = t4_read_reg(sc, mw_base + off + 24); hits = be32toh(hits); } return (hits); } static int get_filter(struct adapter *sc, struct t4_filter *t) { int i, rc, nfilters = sc->tids.nftids; struct filter_entry *f; rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4getf"); if (rc) return (rc); if (sc->tids.ftids_in_use == 0 || sc->tids.ftid_tab == NULL || t->idx >= nfilters) { t->idx = 0xffffffff; goto done; } f = &sc->tids.ftid_tab[t->idx]; for (i = t->idx; i < nfilters; i++, f++) { if (f->valid) { t->idx = i; t->l2tidx = f->l2t ? f->l2t->idx : 0; t->smtidx = f->smtidx; if (f->fs.hitcnts) t->hits = get_filter_hits(sc, t->idx); else t->hits = UINT64_MAX; t->fs = f->fs; goto done; } } t->idx = 0xffffffff; done: end_synchronized_op(sc, LOCK_HELD); return (0); } static int set_filter(struct adapter *sc, struct t4_filter *t) { unsigned int nfilters, nports; struct filter_entry *f; int i, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setf"); if (rc) return (rc); nfilters = sc->tids.nftids; nports = sc->params.nports; if (nfilters == 0) { rc = ENOTSUP; goto done; } if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } if (t->idx >= nfilters) { rc = EINVAL; goto done; } /* Validate against the global filter mode */ if ((sc->params.tp.vlan_pri_map | fspec_to_fconf(&t->fs)) != sc->params.tp.vlan_pri_map) { rc = E2BIG; goto done; } if (t->fs.action == FILTER_SWITCH && t->fs.eport >= nports) { rc = EINVAL; goto done; } if (t->fs.val.iport >= nports) { rc = EINVAL; goto done; } /* Can't specify an iq if not steering to it */ if (!t->fs.dirsteer && t->fs.iq) { rc = EINVAL; goto done; } /* IPv6 filter idx must be 4 aligned */ if (t->fs.type == 1 && ((t->idx & 0x3) || t->idx + 4 >= nfilters)) { rc = EINVAL; goto done; } if (sc->tids.ftid_tab == NULL) { KASSERT(sc->tids.ftids_in_use == 0, ("%s: no memory allocated but filters_in_use > 0", __func__)); sc->tids.ftid_tab = malloc(sizeof (struct filter_entry) * nfilters, M_CXGBE, M_NOWAIT | M_ZERO); if (sc->tids.ftid_tab == NULL) { rc = ENOMEM; goto done; } mtx_init(&sc->tids.ftid_lock, "T4 filters", 0, MTX_DEF); } for (i = 0; i < 4; i++) { f = &sc->tids.ftid_tab[t->idx + i]; if (f->pending || f->valid) { rc = EBUSY; goto done; } if (f->locked) { rc = EPERM; goto done; } if (t->fs.type == 0) break; } f = &sc->tids.ftid_tab[t->idx]; f->fs = t->fs; rc = set_filter_wr(sc, t->idx); done: end_synchronized_op(sc, 0); if (rc == 0) { mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? 0 : EIO; break; } if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock, PCATCH, "t4setfw", 0)) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); } return (rc); } static int del_filter(struct adapter *sc, struct t4_filter *t) { unsigned int nfilters; struct filter_entry *f; int rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4delf"); if (rc) return (rc); nfilters = sc->tids.nftids; if (nfilters == 0) { rc = ENOTSUP; goto done; } if (sc->tids.ftid_tab == NULL || sc->tids.ftids_in_use == 0 || t->idx >= nfilters) { rc = EINVAL; goto done; } if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } f = &sc->tids.ftid_tab[t->idx]; if (f->pending) { rc = EBUSY; goto done; } if (f->locked) { rc = EPERM; goto done; } if (f->valid) { t->fs = f->fs; /* extra info for the caller */ rc = del_filter_wr(sc, t->idx); } done: end_synchronized_op(sc, 0); if (rc == 0) { mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? EIO : 0; break; } if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock, PCATCH, "t4delfw", 0)) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); } return (rc); } static void clear_filter(struct filter_entry *f) { if (f->l2t) t4_l2t_release(f->l2t); bzero(f, sizeof (*f)); } static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; ASSERT_SYNCHRONIZED_OP(sc); if (f->fs.newdmac || f->fs.newvlan) { /* This filter needs an L2T entry; allocate one. */ f->l2t = t4_l2t_alloc_switching(sc->l2t); if (f->l2t == NULL) return (EAGAIN); if (t4_l2t_set_switching(sc, f->l2t, f->fs.vlan, f->fs.eport, f->fs.dmac)) { t4_l2t_release(f->l2t); f->l2t = NULL; return (ENOMEM); } } ftid = sc->tids.ftid_base + fidx; wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); if (wr == NULL) return (ENOMEM); fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); fwr->len16_pkd = htobe32(FW_LEN16(*fwr)); fwr->tid_to_iq = htobe32(V_FW_FILTER_WR_TID(ftid) | V_FW_FILTER_WR_RQTYPE(f->fs.type) | V_FW_FILTER_WR_NOREPLY(0) | V_FW_FILTER_WR_IQ(f->fs.iq)); fwr->del_filter_to_l2tix = htobe32(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) | V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) | V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) | V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) | V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) | V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) | V_FW_FILTER_WR_DMAC(f->fs.newdmac) | V_FW_FILTER_WR_SMAC(f->fs.newsmac) | V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) | V_FW_FILTER_WR_TXCHAN(f->fs.eport) | V_FW_FILTER_WR_PRIO(f->fs.prio) | V_FW_FILTER_WR_L2TIX(f->l2t ? f->l2t->idx : 0)); fwr->ethtype = htobe16(f->fs.val.ethtype); fwr->ethtypem = htobe16(f->fs.mask.ethtype); fwr->frag_to_ovlan_vldm = (V_FW_FILTER_WR_FRAG(f->fs.val.frag) | V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) | V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLD(f->fs.val.vnic_vld) | V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLDM(f->fs.mask.vnic_vld)); fwr->smac_sel = 0; fwr->rx_chan_rx_rpl_iq = htobe16(V_FW_FILTER_WR_RX_CHAN(0) | V_FW_FILTER_WR_RX_RPL_IQ(sc->sge.fwq.abs_id)); fwr->maci_to_matchtypem = htobe32(V_FW_FILTER_WR_MACI(f->fs.val.macidx) | V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) | V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) | V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) | V_FW_FILTER_WR_PORT(f->fs.val.iport) | V_FW_FILTER_WR_PORTM(f->fs.mask.iport) | V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) | V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype)); fwr->ptcl = f->fs.val.proto; fwr->ptclm = f->fs.mask.proto; fwr->ttyp = f->fs.val.tos; fwr->ttypm = f->fs.mask.tos; fwr->ivlan = htobe16(f->fs.val.vlan); fwr->ivlanm = htobe16(f->fs.mask.vlan); fwr->ovlan = htobe16(f->fs.val.vnic); fwr->ovlanm = htobe16(f->fs.mask.vnic); bcopy(f->fs.val.dip, fwr->lip, sizeof (fwr->lip)); bcopy(f->fs.mask.dip, fwr->lipm, sizeof (fwr->lipm)); bcopy(f->fs.val.sip, fwr->fip, sizeof (fwr->fip)); bcopy(f->fs.mask.sip, fwr->fipm, sizeof (fwr->fipm)); fwr->lp = htobe16(f->fs.val.dport); fwr->lpm = htobe16(f->fs.mask.dport); fwr->fp = htobe16(f->fs.val.sport); fwr->fpm = htobe16(f->fs.mask.sport); if (f->fs.newsmac) bcopy(f->fs.smac, fwr->sma, sizeof (fwr->sma)); f->pending = 1; sc->tids.ftids_in_use++; t4_wrq_tx(sc, wr); return (0); } static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; ftid = sc->tids.ftid_base + fidx; wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); if (wr == NULL) return (ENOMEM); fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id); f->pending = 1; t4_wrq_tx(sc, wr); return (0); } int t4_filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1); unsigned int idx = GET_TID(rpl); unsigned int rc; struct filter_entry *f; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (is_ftid(sc, idx)) { idx -= sc->tids.ftid_base; f = &sc->tids.ftid_tab[idx]; rc = G_COOKIE(rpl->cookie); mtx_lock(&sc->tids.ftid_lock); if (rc == FW_FILTER_WR_FLT_ADDED) { KASSERT(f->pending, ("%s: filter[%u] isn't pending.", __func__, idx)); f->smtidx = (be64toh(rpl->oldval) >> 24) & 0xff; f->pending = 0; /* asynchronous setup completed */ f->valid = 1; } else { if (rc != FW_FILTER_WR_FLT_DELETED) { /* Add or delete failed, display an error */ log(LOG_ERR, "filter %u setup failed with error %u\n", idx, rc); } clear_filter(f); sc->tids.ftids_in_use--; } wakeup(&sc->tids.ftid_tab); mtx_unlock(&sc->tids.ftid_lock); } return (0); } static int get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt) { int rc; if (cntxt->cid > M_CTXTQID) return (EINVAL); if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS && cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt"); if (rc) return (rc); if (sc->flags & FW_OK) { rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); if (rc == 0) goto done; } /* * Read via firmware failed or wasn't even attempted. Read directly via * the backdoor. */ rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); done: end_synchronized_op(sc, 0); return (rc); } static int load_fw(struct adapter *sc, struct t4_data *fw) { int rc; uint8_t *fw_data; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw"); if (rc) return (rc); if (sc->flags & FULL_INIT_DONE) { rc = EBUSY; goto done; } fw_data = malloc(fw->len, M_CXGBE, M_WAITOK); if (fw_data == NULL) { rc = ENOMEM; goto done; } rc = copyin(fw->data, fw_data, fw->len); if (rc == 0) rc = -t4_load_fw(sc, fw_data, fw->len); free(fw_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) { uint32_t addr, off, remaining, i, n; uint32_t *buf, *b; uint32_t mw_base, mw_aperture; int rc; uint8_t *dst; rc = validate_mem_range(sc, mr->addr, mr->len); if (rc != 0) return (rc); memwin_info(sc, win, &mw_base, &mw_aperture); buf = b = malloc(min(mr->len, mw_aperture), M_CXGBE, M_WAITOK); addr = mr->addr; remaining = mr->len; dst = (void *)mr->data; while (remaining) { off = position_memwin(sc, win, addr); /* number of bytes that we'll copy in the inner loop */ n = min(remaining, mw_aperture - off); for (i = 0; i < n; i += 4) *b++ = t4_read_reg(sc, mw_base + off + i); rc = copyout(buf, dst, n); if (rc != 0) break; b = buf; dst += n; remaining -= n; addr += n; } free(buf, M_CXGBE); return (rc); } static int read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd) { int rc; if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports) return (EINVAL); if (i2cd->len > sizeof(i2cd->data)) return (EFBIG); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr, i2cd->offset, i2cd->len, &i2cd->data[0]); end_synchronized_op(sc, 0); return (rc); } static int in_range(int val, int lo, int hi) { return (val < 0 || (val <= hi && val >= lo)); } static int set_sched_class(struct adapter *sc, struct t4_sched_params *p) { int fw_subcmd, fw_type, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setsc"); if (rc) return (rc); if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } /* * Translate the cxgbetool parameters into T4 firmware parameters. (The * sub-command and type are in common locations.) */ if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG) fw_subcmd = FW_SCHED_SC_CONFIG; else if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS) fw_subcmd = FW_SCHED_SC_PARAMS; else { rc = EINVAL; goto done; } if (p->type == SCHED_CLASS_TYPE_PACKET) fw_type = FW_SCHED_TYPE_PKTSCHED; else { rc = EINVAL; goto done; } if (fw_subcmd == FW_SCHED_SC_CONFIG) { /* Vet our parameters ..*/ if (p->u.config.minmax < 0) { rc = EINVAL; goto done; } /* And pass the request to the firmware ...*/ rc = -t4_sched_config(sc, fw_type, p->u.config.minmax, 1); goto done; } if (fw_subcmd == FW_SCHED_SC_PARAMS) { int fw_level; int fw_mode; int fw_rateunit; int fw_ratemode; if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL; else if (p->u.params.level == SCHED_CLASS_LEVEL_CL_WRR) fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR; else if (p->u.params.level == SCHED_CLASS_LEVEL_CH_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL; else { rc = EINVAL; goto done; } if (p->u.params.mode == SCHED_CLASS_MODE_CLASS) fw_mode = FW_SCHED_PARAMS_MODE_CLASS; else if (p->u.params.mode == SCHED_CLASS_MODE_FLOW) fw_mode = FW_SCHED_PARAMS_MODE_FLOW; else { rc = EINVAL; goto done; } if (p->u.params.rateunit == SCHED_CLASS_RATEUNIT_BITS) fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; else if (p->u.params.rateunit == SCHED_CLASS_RATEUNIT_PKTS) fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE; else { rc = EINVAL; goto done; } if (p->u.params.ratemode == SCHED_CLASS_RATEMODE_REL) fw_ratemode = FW_SCHED_PARAMS_RATE_REL; else if (p->u.params.ratemode == SCHED_CLASS_RATEMODE_ABS) fw_ratemode = FW_SCHED_PARAMS_RATE_ABS; else { rc = EINVAL; goto done; } /* Vet our parameters ... */ if (!in_range(p->u.params.channel, 0, 3) || !in_range(p->u.params.cl, 0, is_t4(sc) ? 15 : 16) || !in_range(p->u.params.minrate, 0, 10000000) || !in_range(p->u.params.maxrate, 0, 10000000) || !in_range(p->u.params.weight, 0, 100)) { rc = ERANGE; goto done; } /* * Translate any unset parameters into the firmware's * nomenclature and/or fail the call if the parameters * are required ... */ if (p->u.params.rateunit < 0 || p->u.params.ratemode < 0 || p->u.params.channel < 0 || p->u.params.cl < 0) { rc = EINVAL; goto done; } if (p->u.params.minrate < 0) p->u.params.minrate = 0; if (p->u.params.maxrate < 0) { if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL || p->u.params.level == SCHED_CLASS_LEVEL_CH_RL) { rc = EINVAL; goto done; } else p->u.params.maxrate = 0; } if (p->u.params.weight < 0) { if (p->u.params.level == SCHED_CLASS_LEVEL_CL_WRR) { rc = EINVAL; goto done; } else p->u.params.weight = 0; } if (p->u.params.pktsize < 0) { if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL || p->u.params.level == SCHED_CLASS_LEVEL_CH_RL) { rc = EINVAL; goto done; } else p->u.params.pktsize = 0; } /* See what the firmware thinks of the request ... */ rc = -t4_sched_params(sc, fw_type, fw_level, fw_mode, fw_rateunit, fw_ratemode, p->u.params.channel, p->u.params.cl, p->u.params.minrate, p->u.params.maxrate, p->u.params.weight, p->u.params.pktsize, 1); goto done; } rc = EINVAL; done: end_synchronized_op(sc, 0); return (rc); } static int set_sched_queue(struct adapter *sc, struct t4_sched_queue *p) { struct port_info *pi = NULL; struct sge_txq *txq; uint32_t fw_mnem, fw_queue, fw_class; int i, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setsq"); if (rc) return (rc); if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } if (p->port >= sc->params.nports) { rc = EINVAL; goto done; } pi = sc->port[p->port]; if (!in_range(p->queue, 0, pi->ntxq - 1) || !in_range(p->cl, 0, 7)) { rc = EINVAL; goto done; } /* * Create a template for the FW_PARAMS_CMD mnemonic and value (TX * Scheduling Class in this case). */ fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH)); fw_class = p->cl < 0 ? 0xffffffff : p->cl; /* * If op.queue is non-negative, then we're only changing the scheduling * on a single specified TX queue. */ if (p->queue >= 0) { txq = &sc->sge.txq[pi->first_txq + p->queue]; fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); goto done; } /* * Change the scheduling on all the TX queues for the * interface. */ for_each_txq(pi, i, txq) { fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); if (rc) goto done; } rc = 0; done: end_synchronized_op(sc, 0); return (rc); } int t4_os_find_pci_capability(struct adapter *sc, int cap) { int i; return (pci_find_cap(sc->dev, cap, &i) == 0 ? i : 0); } int t4_os_pci_save_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); return (0); } int t4_os_pci_restore_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); return (0); } void t4_os_portmod_changed(const struct adapter *sc, int idx) { struct port_info *pi = sc->port[idx]; static const char *mod_str[] = { NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; build_medialist(pi, &pi->media); #ifdef DEV_NETMAP build_medialist(pi, &pi->nm_media); #endif if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) if_printf(pi->ifp, "transceiver unplugged.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN) if_printf(pi->ifp, "unknown transceiver inserted.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED) if_printf(pi->ifp, "unsupported transceiver inserted.\n"); else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) { if_printf(pi->ifp, "%s transceiver inserted.\n", mod_str[pi->mod_type]); } else { if_printf(pi->ifp, "transceiver (type %d) inserted.\n", pi->mod_type); } } void t4_os_link_changed(struct adapter *sc, int idx, int link_stat, int reason) { struct port_info *pi = sc->port[idx]; struct ifnet *ifp = pi->ifp; if (link_stat) { pi->linkdnrc = -1; ifp->if_baudrate = IF_Mbps(pi->link_cfg.speed); if_link_state_change(ifp, LINK_STATE_UP); } else { if (reason >= 0) pi->linkdnrc = reason; if_link_state_change(ifp, LINK_STATE_DOWN); } } void t4_iterate(void (*func)(struct adapter *, void *), void *arg) { struct adapter *sc; sx_slock(&t4_list_lock); SLIST_FOREACH(sc, &t4_list, link) { /* * func should not make any assumptions about what state sc is * in - the only guarantee is that sc->sc_lock is a valid lock. */ func(sc, arg); } sx_sunlock(&t4_list_lock); } static int t4_open(struct cdev *dev, int flags, int type, struct thread *td) { return (0); } static int t4_close(struct cdev *dev, int flags, int type, struct thread *td) { return (0); } static int t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { int rc; struct adapter *sc = dev->si_drv1; rc = priv_check(td, PRIV_DRIVER); if (rc != 0) return (rc); switch (cmd) { case CHELSIO_T4_GETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) edata->val = t4_read_reg(sc, edata->addr); else if (edata->size == 8) edata->val = t4_read_reg64(sc, edata->addr); else return (EINVAL); break; } case CHELSIO_T4_SETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) { if (edata->val & 0xffffffff00000000) return (EINVAL); t4_write_reg(sc, edata->addr, (uint32_t) edata->val); } else if (edata->size == 8) t4_write_reg64(sc, edata->addr, edata->val); else return (EINVAL); break; } case CHELSIO_T4_REGDUMP: { struct t4_regdump *regs = (struct t4_regdump *)data; int reglen = is_t4(sc) ? T4_REGDUMP_SIZE : T5_REGDUMP_SIZE; uint8_t *buf; if (regs->len < reglen) { regs->len = reglen; /* hint to the caller */ return (ENOBUFS); } regs->len = reglen; buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO); t4_get_regs(sc, regs, buf); rc = copyout(buf, regs->data, reglen); free(buf, M_CXGBE); break; } case CHELSIO_T4_GET_FILTER_MODE: rc = get_filter_mode(sc, (uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MODE: rc = set_filter_mode(sc, *(uint32_t *)data); break; case CHELSIO_T4_GET_FILTER: rc = get_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_SET_FILTER: rc = set_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_DEL_FILTER: rc = del_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_GET_SGE_CONTEXT: rc = get_sge_context(sc, (struct t4_sge_context *)data); break; case CHELSIO_T4_LOAD_FW: rc = load_fw(sc, (struct t4_data *)data); break; case CHELSIO_T4_GET_MEM: rc = read_card_mem(sc, 2, (struct t4_mem_range *)data); break; case CHELSIO_T4_GET_I2C: rc = read_i2c(sc, (struct t4_i2c_data *)data); break; case CHELSIO_T4_CLEAR_STATS: { int i; u_int port_id = *(uint32_t *)data; struct port_info *pi; if (port_id >= sc->params.nports) return (EINVAL); pi = sc->port[port_id]; /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); if (pi->flags & PORT_INIT_DONE) { struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; for_each_rxq(pi, i, rxq) { #if defined(INET) || defined(INET6) rxq->lro.lro_queued = 0; rxq->lro.lro_flushed = 0; #endif rxq->rxcsum = 0; rxq->vlan_extraction = 0; } for_each_txq(pi, i, txq) { txq->txcsum = 0; txq->tso_wrs = 0; txq->vlan_insertion = 0; txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; txq->txpkts_wrs = 0; txq->txpkts_pkts = 0; txq->br->br_drops = 0; txq->no_dmamap = 0; txq->no_desc = 0; } #ifdef TCP_OFFLOAD /* nothing to clear for each ofld_rxq */ for_each_ofld_txq(pi, i, wrq) { wrq->tx_wrs = 0; wrq->no_desc = 0; } #endif wrq = &sc->sge.ctrlq[pi->port_id]; wrq->tx_wrs = 0; wrq->no_desc = 0; } break; } case CHELSIO_T4_SCHED_CLASS: rc = set_sched_class(sc, (struct t4_sched_params *)data); break; case CHELSIO_T4_SCHED_QUEUE: rc = set_sched_queue(sc, (struct t4_sched_queue *)data); break; case CHELSIO_T4_GET_TRACER: rc = t4_get_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_SET_TRACER: rc = t4_set_tracer(sc, (struct t4_tracer *)data); break; default: rc = EINVAL; } return (rc); } #ifdef TCP_OFFLOAD void t4_iscsi_init(struct ifnet *ifp, unsigned int tag_mask, const unsigned int *pgsz_order) { struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; t4_write_reg(sc, A_ULP_RX_ISCSI_TAGMASK, tag_mask); t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, V_HPZ0(pgsz_order[0]) | V_HPZ1(pgsz_order[1]) | V_HPZ2(pgsz_order[2]) | V_HPZ3(pgsz_order[3])); } static int toe_capability(struct port_info *pi, int enable) { int rc; struct adapter *sc = pi->adapter; ASSERT_SYNCHRONIZED_OP(sc); if (!is_offload(sc)) return (ENODEV); if (enable) { if (!(sc->flags & FULL_INIT_DONE)) { rc = cxgbe_init_synchronized(pi); if (rc) return (rc); } if (isset(&sc->offload_map, pi->port_id)) return (0); if (!(sc->flags & TOM_INIT_DONE)) { rc = t4_activate_uld(sc, ULD_TOM); if (rc == EAGAIN) { log(LOG_WARNING, "You must kldload t4_tom.ko before trying " "to enable TOE on a cxgbe interface.\n"); } if (rc != 0) return (rc); KASSERT(sc->tom_softc != NULL, ("%s: TOM activated but softc NULL", __func__)); KASSERT(sc->flags & TOM_INIT_DONE, ("%s: TOM activated but flag not set", __func__)); } setbit(&sc->offload_map, pi->port_id); } else { if (!isset(&sc->offload_map, pi->port_id)) return (0); KASSERT(sc->flags & TOM_INIT_DONE, ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); } return (0); } /* * Add an upper layer driver to the global list. */ int t4_register_uld(struct uld_info *ui) { int rc = 0; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u->uld_id == ui->uld_id) { rc = EEXIST; goto done; } } SLIST_INSERT_HEAD(&t4_uld_list, ui, link); ui->refcount = 0; done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_unregister_uld(struct uld_info *ui) { int rc = EINVAL; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u == ui) { if (ui->refcount > 0) { rc = EBUSY; goto done; } SLIST_REMOVE(&t4_uld_list, ui, uld_info, link); rc = 0; goto done; } } done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_activate_uld(struct adapter *sc, int id) { int rc = EAGAIN; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { if (!(sc->flags & FULL_INIT_DONE)) { rc = adapter_full_init(sc); if (rc != 0) goto done; } rc = ui->activate(sc); if (rc == 0) ui->refcount++; goto done; } } done: sx_sunlock(&t4_uld_list_lock); return (rc); } int t4_deactivate_uld(struct adapter *sc, int id) { int rc = EINVAL; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { rc = ui->deactivate(sc); if (rc == 0) ui->refcount--; goto done; } } done: sx_sunlock(&t4_uld_list_lock); return (rc); } #endif /* * Come up with reasonable defaults for some of the tunables, provided they're * not set by the user (in which case we'll use the values as is). */ static void tweak_tunables(void) { int nc = mp_ncpus; /* our snapshot of the number of CPUs */ if (t4_ntxq10g < 1) t4_ntxq10g = min(nc, NTXQ_10G); if (t4_ntxq1g < 1) t4_ntxq1g = min(nc, NTXQ_1G); if (t4_nrxq10g < 1) t4_nrxq10g = min(nc, NRXQ_10G); if (t4_nrxq1g < 1) t4_nrxq1g = min(nc, NRXQ_1G); #ifdef TCP_OFFLOAD if (t4_nofldtxq10g < 1) t4_nofldtxq10g = min(nc, NOFLDTXQ_10G); if (t4_nofldtxq1g < 1) t4_nofldtxq1g = min(nc, NOFLDTXQ_1G); if (t4_nofldrxq10g < 1) t4_nofldrxq10g = min(nc, NOFLDRXQ_10G); if (t4_nofldrxq1g < 1) t4_nofldrxq1g = min(nc, NOFLDRXQ_1G); if (t4_toecaps_allowed == -1) t4_toecaps_allowed = FW_CAPS_CONFIG_TOE; #else if (t4_toecaps_allowed == -1) t4_toecaps_allowed = 0; #endif #ifdef DEV_NETMAP if (t4_nnmtxq10g < 1) t4_nnmtxq10g = min(nc, NNMTXQ_10G); if (t4_nnmtxq1g < 1) t4_nnmtxq1g = min(nc, NNMTXQ_1G); if (t4_nnmrxq10g < 1) t4_nnmrxq10g = min(nc, NNMRXQ_10G); if (t4_nnmrxq1g < 1) t4_nnmrxq1g = min(nc, NNMRXQ_1G); #endif if (t4_tmr_idx_10g < 0 || t4_tmr_idx_10g >= SGE_NTIMERS) t4_tmr_idx_10g = TMR_IDX_10G; if (t4_pktc_idx_10g < -1 || t4_pktc_idx_10g >= SGE_NCOUNTERS) t4_pktc_idx_10g = PKTC_IDX_10G; if (t4_tmr_idx_1g < 0 || t4_tmr_idx_1g >= SGE_NTIMERS) t4_tmr_idx_1g = TMR_IDX_1G; if (t4_pktc_idx_1g < -1 || t4_pktc_idx_1g >= SGE_NCOUNTERS) t4_pktc_idx_1g = PKTC_IDX_1G; if (t4_qsize_txq < 128) t4_qsize_txq = 128; if (t4_qsize_rxq < 128) t4_qsize_rxq = 128; while (t4_qsize_rxq & 7) t4_qsize_rxq++; t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX; } static struct sx mlu; /* mod load unload */ SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload"); static int mod_event(module_t mod, int cmd, void *arg) { int rc = 0; static int loaded = 0; switch (cmd) { case MOD_LOAD: sx_xlock(&mlu); if (loaded++ == 0) { t4_sge_modload(); sx_init(&t4_list_lock, "T4/T5 adapters"); SLIST_INIT(&t4_list); #ifdef TCP_OFFLOAD sx_init(&t4_uld_list_lock, "T4/T5 ULDs"); SLIST_INIT(&t4_uld_list); #endif t4_tracer_modload(); tweak_tunables(); } sx_xunlock(&mlu); break; case MOD_UNLOAD: sx_xlock(&mlu); if (--loaded == 0) { int tries; sx_slock(&t4_list_lock); if (!SLIST_EMPTY(&t4_list)) { rc = EBUSY; sx_sunlock(&t4_list_lock); goto done_unload; } #ifdef TCP_OFFLOAD sx_slock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; sx_sunlock(&t4_uld_list_lock); sx_sunlock(&t4_list_lock); goto done_unload; } #endif tries = 0; while (tries++ < 5 && t4_sge_extfree_refs() != 0) { uprintf("%ju clusters with custom free routine " "still is use.\n", t4_sge_extfree_refs()); pause("t4unload", 2 * hz); } #ifdef TCP_OFFLOAD sx_sunlock(&t4_uld_list_lock); #endif sx_sunlock(&t4_list_lock); if (t4_sge_extfree_refs() == 0) { t4_tracer_modunload(); #ifdef TCP_OFFLOAD sx_destroy(&t4_uld_list_lock); #endif sx_destroy(&t4_list_lock); t4_sge_modunload(); loaded = 0; } else { rc = EBUSY; loaded++; /* undo earlier decrement */ } } done_unload: sx_xunlock(&mlu); break; } return (rc); } static devclass_t t4_devclass, t5_devclass; static devclass_t cxgbe_devclass, cxl_devclass; DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0); MODULE_VERSION(t4nex, 1); MODULE_DEPEND(t4nex, firmware, 1, 1, 1); DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0); MODULE_VERSION(t5nex, 1); MODULE_DEPEND(t5nex, firmware, 1, 1, 1); DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0); MODULE_VERSION(cxgbe, 1); DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0); MODULE_VERSION(cxl, 1); Index: head/sys/dev/cxgbe/t4_sge.c =================================================================== --- head/sys/dev/cxgbe/t4_sge.c (revision 275357) +++ head/sys/dev/cxgbe/t4_sge.c (revision 275358) @@ -1,4558 +1,4558 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_NETMAP #include #include #include #include #include #endif #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" #ifdef T4_PKT_TIMESTAMP #define RX_COPY_THRESHOLD (MINCLSIZE - 8) #else #define RX_COPY_THRESHOLD MINCLSIZE #endif /* * Ethernet frames are DMA'd at this byte offset into the freelist buffer. * 0-7 are valid values. */ int fl_pktshift = 2; TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift); /* * Pad ethernet payload up to this boundary. * -1: driver should figure out a good value. * 0: disable padding. * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value. */ int fl_pad = -1; TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad); /* * Status page length. * -1: driver should figure out a good value. * 64 or 128 are the only other valid values. */ int spg_len = -1; TUNABLE_INT("hw.cxgbe.spg_len", &spg_len); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. */ static int cong_drop = 0; TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop); /* * Deliver multiple frames in the same free list buffer if they fit. * -1: let the driver decide whether to enable buffer packing or not. * 0: disable buffer packing. * 1: enable buffer packing. */ static int buffer_packing = -1; TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing); /* * Start next frame in a packed buffer at this boundary. * -1: driver should figure out a good value. * T4: * --- * if fl_pad != 0 * value specified here will be overridden by fl_pad. * else * power of 2 from 32 to 4096 (both inclusive) is a valid value here. * T5: * --- * 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. */ static int fl_pack = -1; static int t4_fl_pack; static int t5_fl_pack; TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack); /* * Allow the driver to create mbuf(s) in a cluster allocated for rx. * 0: never; always allocate mbufs from the zone_mbuf UMA zone. * 1: ok to create mbuf(s) within a cluster if there is room. */ static int allow_mbufs_in_cluster = 1; TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster); /* * Largest rx cluster size that the driver is allowed to allocate. */ static int largest_rx_cluster = MJUM16BYTES; TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster); /* * Size of cluster allocation that's most likely to succeed. The driver will * fall back to this size if it fails to allocate clusters larger than this. */ static int safest_rx_cluster = PAGE_SIZE; TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster); /* Used to track coalesced tx work request */ struct txpkts { uint64_t *flitp; /* ptr to flit where next pkt should start */ uint8_t npkt; /* # of packets in this work request */ uint8_t nflits; /* # of flits used by this work request */ uint16_t plen; /* total payload (sum of all packets) */ }; /* A packet's SGL. This + m_pkthdr has all info needed for tx */ struct sgl { int nsegs; /* # of segments in the SGL, 0 means imm. tx */ int nflits; /* # of flits needed for the SGL */ bus_dma_segment_t seg[TX_SGL_SEGS]; }; static int service_iq(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int); static inline void init_fl(struct adapter *, struct sge_fl *, int, int, int, char *); static inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t, char *); static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, bus_addr_t *, void **); static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, void *); static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *, int, int); static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *); static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_fl *); static int alloc_fwq(struct adapter *); static int free_fwq(struct adapter *); static int alloc_mgmtq(struct adapter *); static int free_mgmtq(struct adapter *); static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int, struct sysctl_oid *); static int free_rxq(struct port_info *, struct sge_rxq *); #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *); #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct port_info *, struct sge_nm_rxq *, int, int, struct sysctl_oid *); static int free_nm_rxq(struct port_info *, struct sge_nm_rxq *); static int alloc_nm_txq(struct port_info *, struct sge_nm_txq *, int, int, struct sysctl_oid *); static int free_nm_txq(struct port_info *, struct sge_nm_txq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); #ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *); static int free_eq(struct adapter *, struct sge_eq *); static int alloc_wrq(struct adapter *, struct port_info *, struct sge_wrq *, struct sysctl_oid *); static int free_wrq(struct adapter *, struct sge_wrq *); static int alloc_txq(struct port_info *, struct sge_txq *, int, struct sysctl_oid *); static int free_txq(struct port_info *, struct sge_txq *); static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); static inline void ring_fl_db(struct adapter *, struct sge_fl *); static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int alloc_fl_sdesc(struct sge_fl *); static void free_fl_sdesc(struct adapter *, struct sge_fl *); static void find_best_refill_source(struct adapter *, struct sge_fl *, int); static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int); static int free_pkt_sgl(struct sge_txq *, struct sgl *); static int write_txpkt_wr(struct port_info *, struct sge_txq *, struct mbuf *, struct sgl *); static int add_to_txpkts(struct port_info *, struct sge_txq *, struct txpkts *, struct mbuf *, struct sgl *); static void write_txpkts_wr(struct sge_txq *, struct txpkts *); static inline void write_ulp_cpl_sgl(struct port_info *, struct sge_txq *, struct txpkts *, struct mbuf *, struct sgl *); static int write_sgl_to_txd(struct sge_eq *, struct sgl *, caddr_t *); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *); static inline int reclaimable(struct sge_eq *); static int reclaim_tx_descs(struct sge_txq *, int, int); static void write_eqflush_wr(struct sge_eq *); static __be64 get_flit(bus_dma_segment_t *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); static counter_u64_t extfree_refs; static counter_u64_t extfree_rels; /* * Called on MOD_LOAD. Validates and calculates the SGE tunables. */ void t4_sge_modload(void) { int pad; /* set pad to a reasonable powerof2 between 16 and 4096 (inclusive) */ #if defined(__i386__) || defined(__amd64__) pad = max(cpu_clflush_line_size, 16); #else pad = max(CACHE_LINE_SIZE, 16); #endif pad = min(pad, 4096); if (fl_pktshift < 0 || fl_pktshift > 7) { printf("Invalid hw.cxgbe.fl_pktshift value (%d)," " using 2 instead.\n", fl_pktshift); fl_pktshift = 2; } if (fl_pad != 0 && (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad))) { if (fl_pad != -1) { printf("Invalid hw.cxgbe.fl_pad value (%d)," " using %d instead.\n", fl_pad, max(pad, 32)); } fl_pad = max(pad, 32); } /* * T4 has the same pad and pack boundary. If a pad boundary is set, * pack boundary must be set to the same value. Otherwise take the * specified value or auto-calculate something reasonable. */ if (fl_pad) t4_fl_pack = fl_pad; else if (fl_pack < 32 || fl_pack > 4096 || !powerof2(fl_pack)) t4_fl_pack = max(pad, 32); else t4_fl_pack = fl_pack; /* T5's pack boundary is independent of the pad boundary. */ if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || !powerof2(fl_pack)) t5_fl_pack = max(pad, CACHE_LINE_SIZE); else t5_fl_pack = fl_pack; if (spg_len != 64 && spg_len != 128) { int len; #if defined(__i386__) || defined(__amd64__) len = cpu_clflush_line_size > 64 ? 128 : 64; #else len = 64; #endif if (spg_len != -1) { printf("Invalid hw.cxgbe.spg_len value (%d)," " using %d instead.\n", spg_len, len); } spg_len = len; } if (cong_drop < -1 || cong_drop > 1) { printf("Invalid hw.cxgbe.cong_drop value (%d)," " using 0 instead.\n", cong_drop); cong_drop = 0; } extfree_refs = counter_u64_alloc(M_WAITOK); extfree_rels = counter_u64_alloc(M_WAITOK); counter_u64_zero(extfree_refs); counter_u64_zero(extfree_rels); } void t4_sge_modunload(void) { counter_u64_free(extfree_refs); counter_u64_free(extfree_rels); } uint64_t t4_sge_extfree_refs(void) { uint64_t refs, rels; rels = counter_u64_fetch(extfree_rels); refs = counter_u64_fetch(extfree_refs); return (refs - rels); } void t4_init_sge_cpl_handlers(struct adapter *sc) { t4_register_cpl_handler(sc, CPL_FW4_MSG, handle_fw_msg); t4_register_cpl_handler(sc, CPL_FW6_MSG, handle_fw_msg); t4_register_cpl_handler(sc, CPL_SGE_EGR_UPDATE, handle_sge_egr_update); t4_register_cpl_handler(sc, CPL_RX_PKT, t4_eth_rx); t4_register_fw_msg_handler(sc, FW6_TYPE_CMD_RPL, t4_handle_fw_rpl); } /* * adap->params.vpd.cclk must be set up before this is called. */ void t4_tweak_chip_settings(struct adapter *sc) { int i; uint32_t v, m; int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200}; int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sge_flbuf_sizes[] = { MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, MJUMPAGESIZE - CL_METADATA_SIZE, MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE, #endif MJUM9BYTES, MJUM16BYTES, MCLBYTES - MSIZE - CL_METADATA_SIZE, MJUM9BYTES - CL_METADATA_SIZE, MJUM16BYTES - CL_METADATA_SIZE, }; KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); if (is_t4(sc) && (fl_pad || buffer_packing)) { /* t4_fl_pack has the correct value even when fl_pad = 0 */ m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY); v |= V_INGPADBOUNDARY(ilog2(t4_fl_pack) - 5); } else if (is_t5(sc) && fl_pad) { m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY); v |= V_INGPADBOUNDARY(ilog2(fl_pad) - 5); } t4_set_reg_field(sc, A_SGE_CONTROL, m, v); if (is_t5(sc) && buffer_packing) { m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); if (t5_fl_pack == 16) v = V_INGPACKBOUNDARY(0); else v = V_INGPACKBOUNDARY(ilog2(t5_fl_pack) - 5); t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); } v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES, ("%s: hw buffer size table too big", __func__)); for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) { t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i), sge_flbuf_sizes[i]); } v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]); t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v); KASSERT(intr_timer[0] <= timer_max, ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0], timer_max)); for (i = 1; i < nitems(intr_timer); i++) { KASSERT(intr_timer[i] >= intr_timer[i - 1], ("%s: timers not listed in increasing order (%d)", __func__, i)); while (intr_timer[i] > timer_max) { if (i == nitems(intr_timer) - 1) { intr_timer[i] = timer_max; break; } intr_timer[i] += intr_timer[i - 1]; intr_timer[i] /= 2; } } v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) | V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1])); t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v); v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) | V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3])); t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v); v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) | V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5])); t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v); if (cong_drop == 0) { m = F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 | F_TUNNELCNGDROP3; t4_set_reg_field(sc, A_TP_PARA_REG3, m, 0); } /* 4K, 16K, 64K, 256K DDP "page sizes" */ v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v); m = v = F_TDDPTAGTCB; t4_set_reg_field(sc, A_ULP_RX_CTL, m, v); m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; t4_set_reg_field(sc, A_TP_PARA_REG5, m, v); } /* * SGE wants the buffer to be at least 64B and then a multiple of the pad * boundary or 16, whichever is greater. */ static inline int hwsz_ok(int hwsz) { int mask = max(fl_pad, 16) - 1; return (hwsz >= 64 && (hwsz & mask) == 0); } /* * XXX: driver really should be able to deal with unexpected settings. */ int t4_read_chip_settings(struct adapter *sc) { struct sge *s = &sc->sge; int i, j, n, rc = 0; uint32_t m, v, r; uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sw_buf_sizes[] = { /* Sorted by size */ MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, #endif MJUM9BYTES, MJUM16BYTES }; struct sw_zone_info *swz, *safe_swz; struct hw_buf_info *hwb; m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); if (is_t4(sc) && (fl_pad || buffer_packing)) { m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY); v |= V_INGPADBOUNDARY(ilog2(t4_fl_pack) - 5); } else if (is_t5(sc) && fl_pad) { m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY); v |= V_INGPADBOUNDARY(ilog2(fl_pad) - 5); } r = t4_read_reg(sc, A_SGE_CONTROL); if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); rc = EINVAL; } if (is_t5(sc) && buffer_packing) { m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); if (t5_fl_pack == 16) v = V_INGPACKBOUNDARY(0); else v = V_INGPACKBOUNDARY(ilog2(t5_fl_pack) - 5); r = t4_read_reg(sc, A_SGE_CONTROL2); if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL2(0x%x)\n", r); rc = EINVAL; } } s->pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack; v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); r = t4_read_reg(sc, A_SGE_HOST_PAGE_SIZE); if (r != v) { device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r); rc = EINVAL; } /* Filter out unusable hw buffer sizes entirely (mark with -2). */ hwb = &s->hw_buf_info[0]; for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) { r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i)); hwb->size = r; hwb->zidx = hwsz_ok(r) ? -1 : -2; hwb->next = -1; } /* * Create a sorted list in decreasing order of hw buffer sizes (and so * increasing order of spare area) for each software zone. */ n = 0; /* no usable buffer size to begin with */ swz = &s->sw_zone_info[0]; safe_swz = NULL; for (i = 0; i < SW_ZONE_SIZES; i++, swz++) { int8_t head = -1, tail = -1; swz->size = sw_buf_sizes[i]; swz->zone = m_getzone(swz->size); swz->type = m_gettype(swz->size); if (swz->size == safest_rx_cluster) safe_swz = swz; hwb = &s->hw_buf_info[0]; for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) { if (hwb->zidx != -1 || hwb->size > swz->size) continue; hwb->zidx = i; if (head == -1) head = tail = j; else if (hwb->size < s->hw_buf_info[tail].size) { s->hw_buf_info[tail].next = j; tail = j; } else { int8_t *cur; struct hw_buf_info *t; for (cur = &head; *cur != -1; cur = &t->next) { t = &s->hw_buf_info[*cur]; if (hwb->size == t->size) { hwb->zidx = -2; break; } if (hwb->size > t->size) { hwb->next = *cur; *cur = j; break; } } } } swz->head_hwidx = head; swz->tail_hwidx = tail; if (tail != -1) { n++; if (swz->size - s->hw_buf_info[tail].size >= CL_METADATA_SIZE) sc->flags |= BUF_PACKING_OK; } } if (n == 0) { device_printf(sc->dev, "no usable SGE FL buffer size.\n"); rc = EINVAL; } s->safe_hwidx1 = -1; s->safe_hwidx2 = -1; if (safe_swz != NULL) { s->safe_hwidx1 = safe_swz->head_hwidx; for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) { int spare; hwb = &s->hw_buf_info[i]; spare = safe_swz->size - hwb->size; if (spare < CL_METADATA_SIZE) continue; if (s->safe_hwidx2 == -1 || spare == CL_METADATA_SIZE + MSIZE) s->safe_hwidx2 = i; if (spare >= CL_METADATA_SIZE + MSIZE) break; } } r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD); s->counter_val[0] = G_THRESHOLD_0(r); s->counter_val[1] = G_THRESHOLD_1(r); s->counter_val[2] = G_THRESHOLD_2(r); s->counter_val[3] = G_THRESHOLD_3(r); r = t4_read_reg(sc, A_SGE_TIMER_VALUE_0_AND_1); s->timer_val[0] = G_TIMERVALUE0(r) / core_ticks_per_usec(sc); s->timer_val[1] = G_TIMERVALUE1(r) / core_ticks_per_usec(sc); r = t4_read_reg(sc, A_SGE_TIMER_VALUE_2_AND_3); s->timer_val[2] = G_TIMERVALUE2(r) / core_ticks_per_usec(sc); s->timer_val[3] = G_TIMERVALUE3(r) / core_ticks_per_usec(sc); r = t4_read_reg(sc, A_SGE_TIMER_VALUE_4_AND_5); s->timer_val[4] = G_TIMERVALUE4(r) / core_ticks_per_usec(sc); s->timer_val[5] = G_TIMERVALUE5(r) / core_ticks_per_usec(sc); if (cong_drop == 0) { m = F_TUNNELCNGDROP0 | F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 | F_TUNNELCNGDROP3; r = t4_read_reg(sc, A_TP_PARA_REG3); if (r & m) { device_printf(sc->dev, "invalid TP_PARA_REG3(0x%x)\n", r); rc = EINVAL; } } v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ); if (r != v) { device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r); rc = EINVAL; } m = v = F_TDDPTAGTCB; r = t4_read_reg(sc, A_ULP_RX_CTL); if ((r & m) != v) { device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r); rc = EINVAL; } m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; r = t4_read_reg(sc, A_TP_PARA_REG5); if ((r & m) != v) { device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r); rc = EINVAL; } r = t4_read_reg(sc, A_SGE_CONM_CTRL); s->fl_starve_threshold = G_EGRTHRESHOLD(r) * 2 + 1; if (is_t4(sc)) s->fl_starve_threshold2 = s->fl_starve_threshold; else s->fl_starve_threshold2 = G_EGRTHRESHOLDPACKING(r) * 2 + 1; /* egress queues: log2 of # of doorbells per BAR2 page */ r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF); r >>= S_QUEUESPERPAGEPF0 + (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf; s->eq_s_qpp = r & M_QUEUESPERPAGEPF0; /* ingress queues: log2 of # of doorbells per BAR2 page */ r = t4_read_reg(sc, A_SGE_INGRESS_QUEUES_PER_PAGE_PF); r >>= S_QUEUESPERPAGEPF0 + (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf; s->iq_s_qpp = r & M_QUEUESPERPAGEPF0; t4_init_tp_params(sc); t4_read_mtu_tbl(sc, sc->params.mtus, NULL); t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd); return (rc); } int t4_create_dma_tag(struct adapter *sc) { int rc; rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmat); if (rc != 0) { device_printf(sc->dev, "failed to create main DMA tag: %d\n", rc); } return (rc); } static inline int enable_buffer_packing(struct adapter *sc) { if (sc->flags & BUF_PACKING_OK && ((is_t5(sc) && buffer_packing) || /* 1 or -1 both ok for T5 */ (is_t4(sc) && buffer_packing == 1))) return (1); return (0); } void t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A", "freelist buffer sizes"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, NULL, fl_pad, "payload pad boundary (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, NULL, spg_len, "status page size (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, NULL, cong_drop, "congestion drop setting"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "buffer_packing", CTLFLAG_RD, NULL, enable_buffer_packing(sc), "pack multiple frames in one fl buffer"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)"); } int t4_destroy_dma_tag(struct adapter *sc) { if (sc->dmat) bus_dma_tag_destroy(sc->dmat); return (0); } /* * Allocate and initialize the firmware event queue and the management queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. */ int t4_setup_adapter_queues(struct adapter *sc) { int rc; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); sysctl_ctx_init(&sc->ctx); sc->flags |= ADAP_SYSCTL_CTX; /* * Firmware event queue */ rc = alloc_fwq(sc); if (rc != 0) return (rc); /* * Management queue. This is just a control queue that uses the fwq as * its associated iq. */ rc = alloc_mgmtq(sc); return (rc); } /* * Idempotent */ int t4_teardown_adapter_queues(struct adapter *sc) { ADAPTER_LOCK_ASSERT_NOTOWNED(sc); /* Do this before freeing the queue */ if (sc->flags & ADAP_SYSCTL_CTX) { sysctl_ctx_free(&sc->ctx); sc->flags &= ~ADAP_SYSCTL_CTX; } free_mgmtq(sc); free_fwq(sc); return (0); } static inline int port_intr_count(struct port_info *pi) { int rc = 0; if (pi->flags & INTR_RXQ) rc += pi->nrxq; #ifdef TCP_OFFLOAD if (pi->flags & INTR_OFLD_RXQ) rc += pi->nofldrxq; #endif #ifdef DEV_NETMAP if (pi->flags & INTR_NM_RXQ) rc += pi->nnmrxq; #endif return (rc); } static inline int first_vector(struct port_info *pi) { struct adapter *sc = pi->adapter; int rc = T4_EXTRA_INTR, i; if (sc->intr_count == 1) return (0); for_each_port(sc, i) { if (i == pi->port_id) break; rc += port_intr_count(sc->port[i]); } return (rc); } /* * Given an arbitrary "index," come up with an iq that can be used by other * queues (of this port) for interrupt forwarding, SGE egress updates, etc. * The iq returned is guaranteed to be something that takes direct interrupts. */ static struct sge_iq * port_intr_iq(struct port_info *pi, int idx) { struct adapter *sc = pi->adapter; struct sge *s = &sc->sge; struct sge_iq *iq = NULL; int nintr, i; if (sc->intr_count == 1) return (&sc->sge.fwq); nintr = port_intr_count(pi); KASSERT(nintr != 0, ("%s: pi %p has no exclusive interrupts, total interrupts = %d", __func__, pi, sc->intr_count)); #ifdef DEV_NETMAP /* Exclude netmap queues as they can't take anyone else's interrupts */ if (pi->flags & INTR_NM_RXQ) nintr -= pi->nnmrxq; KASSERT(nintr > 0, ("%s: pi %p has nintr %d after netmap adjustment of %d", __func__, pi, nintr, pi->nnmrxq)); #endif i = idx % nintr; if (pi->flags & INTR_RXQ) { if (i < pi->nrxq) { iq = &s->rxq[pi->first_rxq + i].iq; goto done; } i -= pi->nrxq; } #ifdef TCP_OFFLOAD if (pi->flags & INTR_OFLD_RXQ) { if (i < pi->nofldrxq) { iq = &s->ofld_rxq[pi->first_ofld_rxq + i].iq; goto done; } i -= pi->nofldrxq; } #endif panic("%s: pi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__, pi, pi->flags & INTR_ALL, idx, nintr); done: MPASS(iq != NULL); KASSERT(iq->flags & IQ_INTR, ("%s: iq %p (port %p, intr_flags 0x%lx, idx %d)", __func__, iq, pi, pi->flags & INTR_ALL, idx)); return (iq); } /* Maximum payload that can be delivered with a single iq descriptor */ static inline int mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) { int payload; #ifdef TCP_OFFLOAD if (toe) { payload = sc->tt.rx_coalesce ? G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu; } else { #endif /* large enough even when hw VLAN extraction is disabled */ payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu; #ifdef TCP_OFFLOAD } #endif payload = roundup2(payload, fl_pad); return (payload); } int t4_setup_port_queues(struct port_info *pi) { int rc = 0, i, j, intr_idx, iqid; struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *ctrlq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif char name[16]; struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); int maxp, pack, mtu = ifp->if_mtu; /* Interrupt vector to start from (when using multiple vectors) */ intr_idx = first_vector(pi); /* * First pass over all NIC and TOE rx queues: * a) initialize iq and fl * b) allocate queue iff it will take direct interrupts. */ maxp = mtu_to_max_payload(sc, mtu, 0); pack = enable_buffer_packing(sc); if (pi->flags & INTR_RXQ) { oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); } for_each_rxq(pi, i, rxq) { init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq); snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(pi->dev), i); init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); if (pi->flags & INTR_RXQ) { rxq->iq.flags |= IQ_INTR; rc = alloc_rxq(pi, rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); if (is_offload(sc) && pi->flags & INTR_OFLD_RXQ) { oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); } for_each_ofld_rxq(pi, i, ofld_rxq) { init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq); snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(pi->dev), i); init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); if (pi->flags & INTR_OFLD_RXQ) { ofld_rxq->iq.flags |= IQ_INTR; rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #endif #ifdef DEV_NETMAP /* * We don't have buffers to back the netmap rx queues right now so we * create the queues in a way that doesn't set off any congestion signal * in the chip. */ if (pi->flags & INTR_NM_RXQ) { oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "nm_rxq", CTLFLAG_RD, NULL, "rx queues for netmap"); for_each_nm_rxq(pi, i, nm_rxq) { rc = alloc_nm_rxq(pi, nm_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #endif /* * Second pass over all NIC and TOE rx queues. The queues forwarding * their interrupts are allocated now. */ j = 0; if (!(pi->flags & INTR_RXQ)) { oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_rxq(pi, i, rxq) { MPASS(!(rxq->iq.flags & IQ_INTR)); intr_idx = port_intr_iq(pi, j)->abs_id; rc = alloc_rxq(pi, rxq, intr_idx, i, oid); if (rc != 0) goto done; j++; } } #ifdef TCP_OFFLOAD if (is_offload(sc) && !(pi->flags & INTR_OFLD_RXQ)) { oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); for_each_ofld_rxq(pi, i, ofld_rxq) { MPASS(!(ofld_rxq->iq.flags & IQ_INTR)); intr_idx = port_intr_iq(pi, j)->abs_id; rc = alloc_ofld_rxq(pi, ofld_rxq, intr_idx, i, oid); if (rc != 0) goto done; j++; } } #endif #ifdef DEV_NETMAP if (!(pi->flags & INTR_NM_RXQ)) CXGBE_UNIMPLEMENTED(__func__); #endif /* * Now the tx queues. Only one pass needed. */ oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD, NULL, "tx queues"); j = 0; for_each_txq(pi, i, txq) { iqid = port_intr_iq(pi, j)->cntxt_id; snprintf(name, sizeof(name), "%s txq%d", device_get_nameunit(pi->dev), i); init_eq(&txq->eq, EQ_ETH, pi->qsize_txq, pi->tx_chan, iqid, name); rc = alloc_txq(pi, txq, i, oid); if (rc != 0) goto done; j++; } #ifdef TCP_OFFLOAD oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections"); for_each_ofld_txq(pi, i, ofld_txq) { struct sysctl_oid *oid2; iqid = port_intr_iq(pi, j)->cntxt_id; snprintf(name, sizeof(name), "%s ofld_txq%d", device_get_nameunit(pi->dev), i); init_eq(&ofld_txq->eq, EQ_OFLD, pi->qsize_txq, pi->tx_chan, iqid, name); snprintf(name, sizeof(name), "%d", i); oid2 = SYSCTL_ADD_NODE(&pi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL, "offload tx queue"); rc = alloc_wrq(sc, pi, ofld_txq, oid2); if (rc != 0) goto done; j++; } #endif #ifdef DEV_NETMAP oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "nm_txq", CTLFLAG_RD, NULL, "tx queues for netmap use"); for_each_nm_txq(pi, i, nm_txq) { iqid = pi->first_nm_rxq + (j % pi->nnmrxq); rc = alloc_nm_txq(pi, nm_txq, iqid, i, oid); if (rc != 0) goto done; j++; } #endif /* * Finally, the control queue. */ oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD, NULL, "ctrl queue"); ctrlq = &sc->sge.ctrlq[pi->port_id]; iqid = port_intr_iq(pi, 0)->cntxt_id; snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(pi->dev)); init_eq(&ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid, name); rc = alloc_wrq(sc, pi, ctrlq, oid); done: if (rc) t4_teardown_port_queues(pi); return (rc); } /* * Idempotent */ int t4_teardown_port_queues(struct port_info *pi) { int i; struct adapter *sc = pi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif /* Do this before freeing the queues */ if (pi->flags & PORT_SYSCTL_CTX) { sysctl_ctx_free(&pi->ctx); pi->flags &= ~PORT_SYSCTL_CTX; } /* * Take down all the tx queues first, as they reference the rx queues * (for egress updates, etc.). */ free_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(pi, i, txq) { free_txq(pi, txq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { free_wrq(sc, ofld_txq); } #endif #ifdef DEV_NETMAP for_each_nm_txq(pi, i, nm_txq) free_nm_txq(pi, nm_txq); #endif /* * Then take down the rx queues that forward their interrupts, as they * reference other rx queues. */ for_each_rxq(pi, i, rxq) { if ((rxq->iq.flags & IQ_INTR) == 0) free_rxq(pi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if ((ofld_rxq->iq.flags & IQ_INTR) == 0) free_ofld_rxq(pi, ofld_rxq); } #endif #ifdef DEV_NETMAP for_each_nm_rxq(pi, i, nm_rxq) free_nm_rxq(pi, nm_rxq); #endif /* * Then take down the rx queues that take direct interrupts. */ for_each_rxq(pi, i, rxq) { if (rxq->iq.flags & IQ_INTR) free_rxq(pi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) free_ofld_rxq(pi, ofld_rxq); } #endif #ifdef DEV_NETMAP CXGBE_UNIMPLEMENTED(__func__); #endif return (0); } /* * Deals with errors and the firmware event queue. All data rx queues forward * their interrupt to the firmware event queue. */ void t4_intr_all(void *arg) { struct adapter *sc = arg; struct sge_iq *fwq = &sc->sge.fwq; t4_intr_err(arg); if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) { service_iq(fwq, 0); atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE); } } /* Deals with error interrupts */ void t4_intr_err(void *arg) { struct adapter *sc = arg; t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); t4_slow_intr_handler(sc); } void t4_intr_evt(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } void t4_intr(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } /* * Deals with anything and everything on the given ingress queue. */ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ struct sge_fl *fl; /* Use iff IQ_HAS_FL */ struct adapter *sc = iq->adapter; struct iq_desc *d = &iq->desc[iq->cidx]; int ndescs = 0, limit; int rsp_type, refill; uint32_t lq; uint16_t fl_hw_cidx; struct mbuf *m0; STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); #if defined(INET) || defined(INET6) const struct timeval lro_timeout = {0, sc->lro_timeout}; #endif KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); limit = budget ? budget : iq->qsize / 16; if (iq->flags & IQ_HAS_FL) { fl = &rxq->fl; fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ } else { fl = NULL; fl_hw_cidx = 0; /* to silence gcc warning */ } /* * We always come back and check the descriptor ring for new indirect * interrupts and other responses after running a single handler. */ for (;;) { while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); refill = 0; m0 = NULL; rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: KASSERT(iq->flags & IQ_HAS_FL, ("%s: data for an iq (%p) with no freelist", __func__, iq)); m0 = get_fl_payload(sc, fl, lq); if (__predict_false(m0 == NULL)) goto process_iql; refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2; #ifdef T4_PKT_TIMESTAMP /* * 60 bit timestamp for the payload is * *(uint64_t *)m0->m_pktdat. Note that it is * in the leading free-space in the mbuf. The * kernel can clobber it during a pullup, * m_copymdata, etc. You need to make sure that * the mbuf reaches you unmolested if you care * about the timestamp. */ *(uint64_t *)m0->m_pktdat = be64toh(ctrl->u.last_flit) & 0xfffffffffffffff; #endif /* fall through */ case X_RSPD_TYPE_CPL: KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, d->rss.opcode)); sc->cpl_handler[d->rss.opcode](iq, &d->rss, m0); break; case X_RSPD_TYPE_INTR: /* * Interrupts should be forwarded only to queues * that are not forwarding their interrupts. * This means service_iq can recurse but only 1 * level deep. */ KASSERT(budget == 0, ("%s: budget %u, rsp_type %u", __func__, budget, rsp_type)); /* * There are 1K interrupt-capable queues (qids 0 * through 1023). A response type indicating a * forwarded interrupt with a qid >= 1K is an * iWARP async notification. */ if (lq >= 1024) { sc->an_handler(iq, &d->rsp); break; } q = sc->sge.iqmap[lq - sc->sge.iq_start]; if (atomic_cmpset_int(&q->state, IQS_IDLE, IQS_BUSY)) { if (service_iq(q, q->qsize / 16) == 0) { atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); } else { STAILQ_INSERT_TAIL(&iql, q, link); } } break; default: KASSERT(0, ("%s: illegal response type %d on iq %p", __func__, rsp_type, iq)); log(LOG_ERR, "%s: illegal response type %d on iq %p", device_get_nameunit(sc->dev), rsp_type, iq); break; } d++; if (__predict_false(++iq->cidx == iq->sidx)) { iq->cidx = 0; iq->gen ^= F_RSPD_GEN; d = &iq->desc[0]; } if (__predict_false(++ndescs == limit)) { t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndescs = 0; #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED && sc->lro_timeout != 0) { tcp_lro_flush_inactive(&rxq->lro, &lro_timeout); } #endif if (budget) { if (iq->flags & IQ_HAS_FL) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); } return (EINPROGRESS); } } if (refill) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); fl_hw_cidx = fl->hw_cidx; } } process_iql: if (STAILQ_EMPTY(&iql)) break; /* * Process the head only, and send it to the back of the list if * it's still not done. */ q = STAILQ_FIRST(&iql); STAILQ_REMOVE_HEAD(&iql, link); if (service_iq(q, q->qsize / 8) == 0) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); else STAILQ_INSERT_TAIL(&iql, q, link); } #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED) { struct lro_ctrl *lro = &rxq->lro; struct lro_entry *l; while (!SLIST_EMPTY(&lro->lro_active)) { l = SLIST_FIRST(&lro->lro_active); SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, l); } } #endif t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); if (iq->flags & IQ_HAS_FL) { int starved; FL_LOCK(fl); starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) add_fl_to_sfl(sc, fl); } return (0); } static inline int cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll) { int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0; if (rc) MPASS(cll->region3 >= CL_METADATA_SIZE); return (rc); } static inline struct cluster_metadata * cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll, caddr_t cl) { if (cl_has_metadata(fl, cll)) { struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; return ((struct cluster_metadata *)(cl + swz->size) - 1); } return (NULL); } static void rxb_free(struct mbuf *m, void *arg1, void *arg2) { uma_zone_t zone = arg1; caddr_t cl = arg2; uma_zfree(zone, cl); counter_u64_add(extfree_rels, 1); } /* * The mbuf returned by this function could be allocated from zone_mbuf or * constructed in spare room in the cluster. * * The mbuf carries the payload in one of these ways * a) frame inside the mbuf (mbuf from zone_mbuf) * b) m_cljset (for clusters without metadata) zone_mbuf * c) m_extaddref (cluster with metadata) inline mbuf * d) m_extaddref (cluster with metadata) zone_mbuf */ static struct mbuf * get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags) { struct mbuf *m; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; struct cluster_layout *cll = &sd->cll; struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx]; struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl); int len, padded_len; caddr_t payload; len = min(total, hwb->size - fl->rx_offset); padded_len = roundup2(len, fl->buf_boundary); payload = sd->cl + cll->region1 + fl->rx_offset; if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { /* * Copy payload into a freshly allocated mbuf. */ m = flags & M_PKTHDR ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; #ifdef T4_PKT_TIMESTAMP /* Leave room for a timestamp */ m->m_data += 8; #endif /* copy data to mbuf */ bcopy(payload, mtod(m, caddr_t), len); } else if (sd->nmbuf * MSIZE < cll->region1) { /* * There's spare room in the cluster for an mbuf. Create one * and associate it with the payload that's in the cluster. */ MPASS(clm != NULL); m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE); /* No bzero required */ if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA, flags | M_NOFREE)) return (NULL); fl->mbuf_inlined++; m_extaddref(m, payload, padded_len, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { /* * Grab an mbuf from zone_mbuf and associate it with the * payload in the cluster. */ m = flags & M_PKTHDR ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; if (clm != NULL) { m_extaddref(m, payload, padded_len, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { m_cljset(m, sd->cl, swz->type); sd->cl = NULL; /* consumed, not a recycle candidate */ } } if (flags & M_PKTHDR) m->m_pkthdr.len = total; m->m_len = len; if (fl->flags & FL_BUF_PACKING) { fl->rx_offset += padded_len; MPASS(fl->rx_offset <= hwb->size); if (fl->rx_offset < hwb->size) return (m); /* without advancing the cidx */ } if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } fl->rx_offset = 0; return (m); } static struct mbuf * get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf) { struct mbuf *m0, *m, **pnext; u_int len; len = G_RSPD_LEN(len_newbuf); if (__predict_false(fl->flags & FL_BUF_RESUME)) { M_ASSERTPKTHDR(fl->m0); MPASS(len == fl->m0->m_pkthdr.len); MPASS(fl->remaining < len); m0 = fl->m0; pnext = fl->pnext; len = fl->remaining; fl->flags &= ~FL_BUF_RESUME; goto get_segment; } if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) { fl->rx_offset = 0; if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } } /* * Payload starts at rx_offset in the current hw buffer. Its length is * 'len' and it may span multiple hw buffers. */ m0 = get_scatter_segment(sc, fl, len, M_PKTHDR); if (m0 == NULL) return (NULL); len -= m0->m_len; pnext = &m0->m_next; while (len > 0) { get_segment: MPASS(fl->rx_offset == 0); m = get_scatter_segment(sc, fl, len, 0); if (__predict_false(m == NULL)) { fl->m0 = m0; fl->pnext = pnext; fl->remaining = len; fl->flags |= FL_BUF_RESUME; return (NULL); } *pnext = m; pnext = &m->m_next; len -= m->m_len; } *pnext = NULL; return (m0); } static int t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) { struct sge_rxq *rxq = iq_to_rxq(iq); struct ifnet *ifp = rxq->ifp; const struct cpl_rx_pkt *cpl = (const void *)(rss + 1); #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxq->lro; #endif KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__, rss->opcode)); m0->m_pkthdr.len -= fl_pktshift; m0->m_len -= fl_pktshift; m0->m_data += fl_pktshift; m0->m_pkthdr.rcvif = ifp; - m0->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m0, M_HASHTYPE_OPAQUE); m0->m_pkthdr.flowid = be32toh(rss->hash_val); if (cpl->csum_calc && !cpl->err_vec) { if (ifp->if_capenable & IFCAP_RXCSUM && cpl->l2info & htobe32(F_RXF_IP)) { m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); rxq->rxcsum++; } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && cpl->l2info & htobe32(F_RXF_IP6)) { m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); rxq->rxcsum++; } if (__predict_false(cpl->ip_frag)) m0->m_pkthdr.csum_data = be16toh(cpl->csum); else m0->m_pkthdr.csum_data = 0xffff; } if (cpl->vlan_ex) { m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan); m0->m_flags |= M_VLANTAG; rxq->vlan_extraction++; } #if defined(INET) || defined(INET6) if (cpl->l2info & htobe32(F_RXF_LRO) && iq->flags & IQ_LRO_ENABLED && tcp_lro_rx(lro, m0, 0) == 0) { /* queued for LRO */ } else #endif ifp->if_input(ifp, m0); return (0); } /* * Doesn't fail. Holds on to work requests it can't send right away. */ void t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { struct sge_eq *eq = &wrq->eq; int can_reclaim; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(wrq); #ifdef TCP_OFFLOAD KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD || (eq->flags & EQ_TYPEMASK) == EQ_CTRL, ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); #else KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL, ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); #endif if (__predict_true(wr != NULL)) STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); can_reclaim = reclaimable(eq); if (__predict_false(eq->flags & EQ_STALLED)) { if (eq->avail + can_reclaim < tx_resume_threshold(eq)) return; eq->flags &= ~EQ_STALLED; eq->unstalled++; } eq->cidx += can_reclaim; eq->avail += can_reclaim; if (__predict_false(eq->cidx >= eq->cap)) eq->cidx -= eq->cap; while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) { int ndesc; if (__predict_false(wr->wr_len < 0 || wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) { #ifdef INVARIANTS panic("%s: work request with length %d", __func__, wr->wr_len); #endif #ifdef KDB kdb_backtrace(); #endif log(LOG_ERR, "%s: %s work request with length %d", device_get_nameunit(sc->dev), __func__, wr->wr_len); STAILQ_REMOVE_HEAD(&wrq->wr_list, link); free_wrqe(wr); continue; } ndesc = howmany(wr->wr_len, EQ_ESIZE); if (eq->avail < ndesc) { wrq->no_desc++; break; } dst = (void *)&eq->desc[eq->pidx]; copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len); eq->pidx += ndesc; eq->avail -= ndesc; if (__predict_false(eq->pidx >= eq->cap)) eq->pidx -= eq->cap; eq->pending += ndesc; if (eq->pending >= 8) ring_eq_db(sc, eq); wrq->tx_wrs++; STAILQ_REMOVE_HEAD(&wrq->wr_list, link); free_wrqe(wr); if (eq->avail < 8) { can_reclaim = reclaimable(eq); eq->cidx += can_reclaim; eq->avail += can_reclaim; if (__predict_false(eq->cidx >= eq->cap)) eq->cidx -= eq->cap; } } if (eq->pending) ring_eq_db(sc, eq); if (wr != NULL) { eq->flags |= EQ_STALLED; if (callout_pending(&eq->tx_callout) == 0) callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); } } /* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */ #define TXPKTS_PKT_HDR ((\ sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + \ sizeof(struct cpl_tx_pkt_core) \ ) / 8) /* Header of a coalesced tx WR, before SGL of first packet (in flits) */ #define TXPKTS_WR_HDR (\ sizeof(struct fw_eth_tx_pkts_wr) / 8 + \ TXPKTS_PKT_HDR) /* Header of a tx WR, before SGL of first packet (in flits) */ #define TXPKT_WR_HDR ((\ sizeof(struct fw_eth_tx_pkt_wr) + \ sizeof(struct cpl_tx_pkt_core) \ ) / 8 ) /* Header of a tx LSO WR, before SGL of first packet (in flits) */ #define TXPKT_LSO_WR_HDR ((\ sizeof(struct fw_eth_tx_pkt_wr) + \ sizeof(struct cpl_tx_pkt_lso_core) + \ sizeof(struct cpl_tx_pkt_core) \ ) / 8 ) int t4_eth_tx(struct ifnet *ifp, struct sge_txq *txq, struct mbuf *m) { struct port_info *pi = (void *)ifp->if_softc; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; struct buf_ring *br = txq->br; struct mbuf *next; int rc, coalescing, can_reclaim; struct txpkts txpkts; struct sgl sgl; TXQ_LOCK_ASSERT_OWNED(txq); KASSERT(m, ("%s: called with nothing to do.", __func__)); KASSERT((eq->flags & EQ_TYPEMASK) == EQ_ETH, ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); prefetch(&eq->desc[eq->pidx]); prefetch(&txq->sdesc[eq->pidx]); txpkts.npkt = 0;/* indicates there's nothing in txpkts */ coalescing = 0; can_reclaim = reclaimable(eq); if (__predict_false(eq->flags & EQ_STALLED)) { if (eq->avail + can_reclaim < tx_resume_threshold(eq)) { txq->m = m; return (0); } eq->flags &= ~EQ_STALLED; eq->unstalled++; } if (__predict_false(eq->flags & EQ_DOOMED)) { m_freem(m); while ((m = buf_ring_dequeue_sc(txq->br)) != NULL) m_freem(m); return (ENETDOWN); } if (eq->avail < 8 && can_reclaim) reclaim_tx_descs(txq, can_reclaim, 32); for (; m; m = next ? next : drbr_dequeue(ifp, br)) { if (eq->avail < 8) break; next = m->m_nextpkt; m->m_nextpkt = NULL; if (next || buf_ring_peek(br)) coalescing = 1; rc = get_pkt_sgl(txq, &m, &sgl, coalescing); if (rc != 0) { if (rc == ENOMEM) { /* Short of resources, suspend tx */ m->m_nextpkt = next; break; } /* * Unrecoverable error for this packet, throw it away * and move on to the next. get_pkt_sgl may already * have freed m (it will be NULL in that case and the * m_freem here is still safe). */ m_freem(m); continue; } if (coalescing && add_to_txpkts(pi, txq, &txpkts, m, &sgl) == 0) { /* Successfully absorbed into txpkts */ write_ulp_cpl_sgl(pi, txq, &txpkts, m, &sgl); goto doorbell; } /* * We weren't coalescing to begin with, or current frame could * not be coalesced (add_to_txpkts flushes txpkts if a frame * given to it can't be coalesced). Either way there should be * nothing in txpkts. */ KASSERT(txpkts.npkt == 0, ("%s: txpkts not empty: %d", __func__, txpkts.npkt)); /* We're sending out individual packets now */ coalescing = 0; if (eq->avail < 8) reclaim_tx_descs(txq, 0, 8); rc = write_txpkt_wr(pi, txq, m, &sgl); if (rc != 0) { /* Short of hardware descriptors, suspend tx */ /* * This is an unlikely but expensive failure. We've * done all the hard work (DMA mappings etc.) and now we * can't send out the packet. What's worse, we have to * spend even more time freeing up everything in sgl. */ txq->no_desc++; free_pkt_sgl(txq, &sgl); m->m_nextpkt = next; break; } ETHER_BPF_MTAP(ifp, m); if (sgl.nsegs == 0) m_freem(m); doorbell: if (eq->pending >= 8) ring_eq_db(sc, eq); can_reclaim = reclaimable(eq); if (can_reclaim >= 32) reclaim_tx_descs(txq, can_reclaim, 64); } if (txpkts.npkt > 0) write_txpkts_wr(txq, &txpkts); /* * m not NULL means there was an error but we haven't thrown it away. * This can happen when we're short of tx descriptors (no_desc) or maybe * even DMA maps (no_dmamap). Either way, a credit flush and reclaim * will get things going again. */ if (m && !(eq->flags & EQ_CRFLUSHED)) { struct tx_sdesc *txsd = &txq->sdesc[eq->pidx]; /* * If EQ_CRFLUSHED is not set then we know we have at least one * available descriptor because any WR that reduces eq->avail to * 0 also sets EQ_CRFLUSHED. */ KASSERT(eq->avail > 0, ("%s: no space for eqflush.", __func__)); txsd->desc_used = 1; txsd->credits = 0; write_eqflush_wr(eq); } txq->m = m; if (eq->pending) ring_eq_db(sc, eq); reclaim_tx_descs(txq, 0, 128); if (eq->flags & EQ_STALLED && callout_pending(&eq->tx_callout) == 0) callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); return (0); } void t4_update_fl_bufsize(struct ifnet *ifp) { struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif struct sge_fl *fl; int i, maxp, mtu = ifp->if_mtu; maxp = mtu_to_max_payload(sc, mtu, 0); for_each_rxq(pi, i, rxq) { fl = &rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(pi, i, ofld_rxq) { fl = &ofld_rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #endif } int can_resume_tx(struct sge_eq *eq) { return (eq->avail + reclaimable(eq) >= tx_resume_threshold(eq)); } static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, int qsize) { KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS, ("%s: bad tmr_idx %d", __func__, tmr_idx)); KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */ ("%s: bad pktc_idx %d", __func__, pktc_idx)); iq->flags = 0; iq->adapter = sc; iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx); iq->intr_pktc_idx = SGE_NCOUNTERS - 1; if (pktc_idx >= 0) { iq->intr_params |= F_QINTR_CNT_EN; iq->intr_pktc_idx = pktc_idx; } iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */ iq->sidx = iq->qsize - spg_len / IQ_ESIZE; } static inline void init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, int pack, char *name) { fl->qsize = qsize; fl->sidx = qsize - spg_len / EQ_ESIZE; strlcpy(fl->lockname, name, sizeof(fl->lockname)); if (pack) fl->flags |= FL_BUF_PACKING; find_best_refill_source(sc, fl, maxp); find_safe_refill_source(sc, fl); } static inline void init_eq(struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan, uint16_t iqid, char *name) { KASSERT(tx_chan < NCHAN, ("%s: bad tx channel %d", __func__, tx_chan)); KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype)); eq->flags = eqtype & EQ_TYPEMASK; eq->tx_chan = tx_chan; eq->iqid = iqid; eq->qsize = qsize; strlcpy(eq->lockname, name, sizeof(eq->lockname)); TASK_INIT(&eq->tx_task, 0, t4_tx_task, eq); callout_init(&eq->tx_callout, CALLOUT_MPSAFE); } static int alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_addr_t *pa, void **va) { int rc; rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc); goto done; } rc = bus_dmamem_alloc(*tag, va, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc); goto done; } rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0); if (rc != 0) { device_printf(sc->dev, "cannot load DMA map: %d\n", rc); goto done; } done: if (rc) free_ring(sc, *tag, *map, *pa, *va); return (rc); } static int free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map, bus_addr_t pa, void *va) { if (pa) bus_dmamap_unload(tag, map); if (va) bus_dmamem_free(tag, va, map); if (tag) bus_dma_tag_destroy(tag); return (0); } /* * Allocates the ring for an ingress queue and an optional freelist. If the * freelist is specified it will be allocated and then associated with the * ingress queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. * * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then * the intr_idx specifies the vector, starting from 0. Otherwise it specifies * the abs_id of the ingress queue to which its interrupts should be forwarded. */ static int alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, int intr_idx, int cong) { int rc, i, cntxt_id; size_t len; struct fw_iq_cmd c; struct adapter *sc = iq->adapter; __be32 v = 0; len = iq->qsize * IQ_ESIZE; rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba, (void **)&iq->desc); if (rc != 0) return (rc); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | V_FW_IQ_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | FW_LEN16(c)); /* Special handling for firmware event queue */ if (iq == &sc->sge.fwq) v |= F_FW_IQ_CMD_IQASYNCH; if (iq->flags & IQ_INTR) { KASSERT(intr_idx < sc->intr_count, ("%s: invalid direct intr_idx %d", __func__, intr_idx)); } else v |= F_FW_IQ_CMD_IQANDST; v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx); c.type_to_iqandstindex = htobe32(v | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(pi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) | V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(iq->qsize); c.iqaddr = htobe64(iq->ba); if (cong >= 0) c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN); if (fl) { mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); len = fl->qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); if (rc) return (rc); /* Allocate space for one software descriptor per buffer. */ rc = alloc_fl_sdesc(fl); if (rc != 0) { device_printf(sc->dev, "failed to setup fl software descriptors: %d\n", rc); return (rc); } if (fl->flags & FL_BUF_PACKING) { fl->lowat = roundup2(sc->sge.fl_starve_threshold2, 8); fl->buf_boundary = max(fl_pad, sc->sge.pack_boundary); } else { fl->lowat = roundup2(sc->sge.fl_starve_threshold, 8); fl->buf_boundary = fl_pad; } c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN : 0)); if (cong >= 0) { c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) | F_FW_IQ_CMD_FL0CONGCIF | F_FW_IQ_CMD_FL0CONGEN); } c.fl0dcaen_to_fl0cidxfthresh = htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_64B) | V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B)); c.fl0size = htobe16(fl->qsize); c.fl0addr = htobe64(fl->ba); } rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create ingress queue: %d\n", rc); return (rc); } iq->cidx = 0; iq->gen = F_RSPD_GEN; iq->intr_next = iq->intr_params; iq->cntxt_id = be16toh(c.iqid); iq->abs_id = be16toh(c.physiqid); iq->flags |= IQ_ALLOCATED; cntxt_id = iq->cntxt_id - sc->sge.iq_start; if (cntxt_id >= sc->sge.niq) { panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.niq - 1); } sc->sge.iqmap[cntxt_id] = iq; if (fl) { u_int qid; iq->flags |= IQ_HAS_FL; fl->cntxt_id = be16toh(c.fl0id); fl->pidx = fl->cidx = 0; cntxt_id = fl->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) { panic("%s: fl->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); } sc->sge.eqmap[cntxt_id] = (void *)fl; qid = fl->cntxt_id; if (isset(&sc->doorbells, DOORBELL_UDB)) { uint32_t s_qpp = sc->sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (qid >> s_qpp) << PAGE_SHIFT; qid &= mask; if (qid < PAGE_SIZE / UDBS_SEG_SIZE) { udb += qid << UDBS_SEG_SHIFT; qid = 0; } fl->udb = (volatile void *)udb; } fl->dbval = F_DBPRIO | V_QID(qid); if (is_t5(sc)) fl->dbval |= F_DBTYPE; FL_LOCK(fl); /* Enough to make sure the SGE doesn't think it's starved */ refill_fl(sc, fl, fl->lowat); FL_UNLOCK(fl); } if (is_t5(sc) && cong >= 0) { uint32_t param, val; param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(iq->cntxt_id); if (cong == 0) val = 1 << 19; else { val = 2 << 19; for (i = 0; i < 4; i++) { if (cong & (1 << i)) val |= 1 << (i << 2); } } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* report error but carry on */ device_printf(sc->dev, "failed to set congestion manager context for " "ingress queue %d: %d\n", iq->cntxt_id, rc); } } /* Enable IQ interrupts */ atomic_store_rel_int(&iq->state, IQS_IDLE); t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(iq->intr_params) | V_INGRESSQID(iq->cntxt_id)); return (0); } static int free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl) { int rc; struct adapter *sc = iq->adapter; device_t dev; if (sc == NULL) return (0); /* nothing to do */ dev = pi ? pi->dev : sc->dev; if (iq->flags & IQ_ALLOCATED) { rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff); if (rc != 0) { device_printf(dev, "failed to free queue %p: %d\n", iq, rc); return (rc); } iq->flags &= ~IQ_ALLOCATED; } free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc); bzero(iq, sizeof(*iq)); if (fl) { free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc); if (fl->sdesc) free_fl_sdesc(sc, fl); if (mtx_initialized(&fl->fl_lock)) mtx_destroy(&fl->fl_lock); bzero(fl, sizeof(*fl)); } return (0); } static void add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_fl *fl) { struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, 0, "consumer index"); if (fl->flags & FL_BUF_PACKING) { SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); } SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, 0, "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated", CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined", CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); } static int alloc_fwq(struct adapter *sc) { int rc, intr_idx; struct sge_iq *fwq = &sc->sge.fwq; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE); fwq->flags |= IQ_INTR; /* always */ intr_idx = sc->intr_count > 1 ? 1 : 0; rc = alloc_iq_fl(sc->port[0], fwq, NULL, intr_idx, -1); if (rc != 0) { device_printf(sc->dev, "failed to create firmware event queue: %d\n", rc); return (rc); } oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD, NULL, "firmware event queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I", "consumer index"); return (0); } static int free_fwq(struct adapter *sc) { return free_iq_fl(NULL, &sc->sge.fwq, NULL); } static int alloc_mgmtq(struct adapter *sc) { int rc; struct sge_wrq *mgmtq = &sc->sge.mgmtq; char name[16]; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD, NULL, "management queue"); snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev)); init_eq(&mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan, sc->sge.fwq.cntxt_id, name); rc = alloc_wrq(sc, NULL, mgmtq, oid); if (rc != 0) { device_printf(sc->dev, "failed to create management queue: %d\n", rc); return (rc); } return (0); } static int free_mgmtq(struct adapter *sc) { return free_wrq(sc, &sc->sge.mgmtq); } static inline int tnl_cong(struct port_info *pi) { if (cong_drop == -1) return (-1); else if (cong_drop == 1) return (0); else return (pi->rx_chan_map); } static int alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(pi)); if (rc != 0) return (rc); /* * The freelist is just barely above the starvation threshold right now, * fill it up a bit more. */ FL_LOCK(&rxq->fl); refill_fl(pi->adapter, &rxq->fl, 128); FL_UNLOCK(&rxq->fl); #if defined(INET) || defined(INET6) rc = tcp_lro_init(&rxq->lro); if (rc != 0) return (rc); rxq->lro.ifp = pi->ifp; /* also indicates LRO init'ed */ if (pi->ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; #endif rxq->ifp = pi->ifp; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); #if defined(INET) || defined(INET6) SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, &rxq->lro.lro_queued, 0, NULL); SYSCTL_ADD_INT(&pi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &rxq->lro.lro_flushed, 0, NULL); #endif SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD, &rxq->rxcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); add_fl_sysctls(&pi->ctx, oid, &rxq->fl); return (rc); } static int free_rxq(struct port_info *pi, struct sge_rxq *rxq) { int rc; #if defined(INET) || defined(INET6) if (rxq->lro.ifp) { tcp_lro_free(&rxq->lro); rxq->lro.ifp = NULL; } #endif rc = free_iq_fl(pi, &rxq->iq, &rxq->fl); if (rc == 0) bzero(rxq, sizeof(*rxq)); return (rc); } #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, pi->rx_chan_map); if (rc != 0) return (rc); children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); add_fl_sysctls(&pi->ctx, oid, &ofld_rxq->fl); return (rc); } static int free_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq) { int rc; rc = free_iq_fl(pi, &ofld_rxq->iq, &ofld_rxq->fl); if (rc == 0) bzero(ofld_rxq, sizeof(*ofld_rxq)); return (rc); } #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct port_info *pi, struct sge_nm_rxq *nm_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; struct sysctl_ctx_list *ctx; char name[16]; size_t len; struct adapter *sc = pi->adapter; struct netmap_adapter *na = NA(pi->nm_ifp); MPASS(na != NULL); len = pi->qsize_rxq * IQ_ESIZE; rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map, &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc); if (rc != 0) return (rc); len = na->num_rx_desc * EQ_ESIZE + spg_len; rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map, &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc); if (rc != 0) return (rc); nm_rxq->pi = pi; nm_rxq->nid = idx; nm_rxq->iq_cidx = 0; nm_rxq->iq_sidx = pi->qsize_rxq - spg_len / IQ_ESIZE; nm_rxq->iq_gen = F_RSPD_GEN; nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; nm_rxq->fl_sidx = na->num_rx_desc; nm_rxq->intr_idx = intr_idx; ctx = &pi->ctx; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I", "consumer index"); children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &nm_rxq->fl_cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &nm_rxq->fl_pidx, 0, "producer index"); return (rc); } static int free_nm_rxq(struct port_info *pi, struct sge_nm_rxq *nm_rxq) { struct adapter *sc = pi->adapter; free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba, nm_rxq->iq_desc); free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba, nm_rxq->fl_desc); return (0); } static int alloc_nm_txq(struct port_info *pi, struct sge_nm_txq *nm_txq, int iqidx, int idx, struct sysctl_oid *oid) { int rc; size_t len; struct adapter *sc = pi->adapter; struct netmap_adapter *na = NA(pi->nm_ifp); char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); len = na->num_tx_desc * EQ_ESIZE + spg_len; rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map, &nm_txq->ba, (void **)&nm_txq->desc); if (rc) return (rc); nm_txq->pidx = nm_txq->cidx = 0; nm_txq->sidx = na->num_tx_desc; nm_txq->nid = idx; nm_txq->iqidx = iqidx; nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf)); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "netmap tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &nm_txq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I", "producer index"); return (rc); } static int free_nm_txq(struct port_info *pi, struct sge_nm_txq *nm_txq) { struct adapter *sc = pi->adapter; free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba, nm_txq->desc); return (0); } #endif static int ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ctrl_cmd c; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) | V_FW_EQ_CTRL_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); /* XXX */ c.physeqid_pkd = htobe32(0); c.fetchszm_to_iqid = htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_CTRL_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | V_FW_EQ_CTRL_CMD_EQSIZE(eq->qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create control queue %d: %d\n", eq->tx_chan, rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } static int eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_eth_cmd c; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); c.autoequiqe_to_viid = htobe32(V_FW_EQ_ETH_CMD_VIID(pi->viid)); c.fetchszm_to_iqid = htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(pi->dev, "failed to create Ethernet egress queue: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ofld_cmd c; bzero(&c, sizeof(c)); c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) | V_FW_EQ_OFLD_CMD_VFN(0)); c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); c.fetchszm_to_iqid = htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_OFLD_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | V_FW_EQ_OFLD_CMD_EQSIZE(eq->qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(pi->dev, "failed to create egress queue for TCP offload: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #endif static int alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { int rc; size_t len; mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); len = eq->qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, (void **)&eq->desc); if (rc) return (rc); eq->cap = eq->qsize - spg_len / EQ_ESIZE; eq->spg = (void *)&eq->desc[eq->cap]; eq->avail = eq->cap - 1; /* one less to avoid cidx = pidx */ eq->pidx = eq->cidx = 0; eq->doorbells = sc->doorbells; switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = ctrl_eq_alloc(sc, eq); break; case EQ_ETH: rc = eth_eq_alloc(sc, pi, eq); break; #ifdef TCP_OFFLOAD case EQ_OFLD: rc = ofld_eq_alloc(sc, pi, eq); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to allocate egress queue(%d): %d\n", eq->flags & EQ_TYPEMASK, rc); } eq->tx_callout.c_cpu = eq->cntxt_id % mp_ncpus; if (isset(&eq->doorbells, DOORBELL_UDB) || isset(&eq->doorbells, DOORBELL_UDBWC) || isset(&eq->doorbells, DOORBELL_WCWR)) { uint32_t s_qpp = sc->sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */ eq->udb_qid = eq->cntxt_id & mask; /* id in page */ if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&eq->doorbells, DOORBELL_WCWR); else { udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */ eq->udb_qid = 0; } eq->udb = (volatile void *)udb; } return (rc); } static int free_eq(struct adapter *sc, struct sge_eq *eq) { int rc; if (eq->flags & EQ_ALLOCATED) { switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; case EQ_ETH: rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #ifdef TCP_OFFLOAD case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to free egress queue (%d): %d\n", eq->flags & EQ_TYPEMASK, rc); return (rc); } eq->flags &= ~EQ_ALLOCATED; } free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc); if (mtx_initialized(&eq->eq_lock)) mtx_destroy(&eq->eq_lock); bzero(eq, sizeof(*eq)); return (0); } static int alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq, struct sysctl_oid *oid) { int rc; struct sysctl_ctx_list *ctx = pi ? &pi->ctx : &sc->ctx; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = alloc_eq(sc, pi, &wrq->eq); if (rc) return (rc); wrq->adapter = sc; STAILQ_INIT(&wrq->wr_list); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs", CTLFLAG_RD, &wrq->tx_wrs, "# of work requests"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD, &wrq->no_desc, 0, "# of times queue ran out of hardware descriptors"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD, &wrq->eq.unstalled, 0, "# of times queue recovered after stall"); return (rc); } static int free_wrq(struct adapter *sc, struct sge_wrq *wrq) { int rc; rc = free_eq(sc, &wrq->eq); if (rc) return (rc); bzero(wrq, sizeof(*wrq)); return (0); } static int alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx, struct sysctl_oid *oid) { int rc; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = alloc_eq(sc, pi, eq); if (rc) return (rc); txq->ifp = pi->ifp; txq->sdesc = malloc(eq->cap * sizeof(struct tx_sdesc), M_CXGBE, M_ZERO | M_WAITOK); txq->br = buf_ring_alloc(eq->qsize, M_CXGBE, M_WAITOK, &eq->eq_lock); rc = bus_dma_tag_create(sc->dmat, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, 64 * 1024, TX_SGL_SEGS, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &txq->tx_tag); if (rc != 0) { device_printf(sc->dev, "failed to create tx DMA tag: %d\n", rc); return (rc); } /* * We can stuff ~10 frames in an 8-descriptor txpkts WR (8 is the SGE * limit for any WR). txq->no_dmamap events shouldn't occur if maps is * sized for the worst case. */ rc = t4_alloc_tx_maps(&txq->txmaps, txq->tx_tag, eq->qsize * 10 / 8, M_WAITOK); if (rc != 0) { device_printf(sc->dev, "failed to setup tx DMA maps: %d\n", rc); return (rc); } snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &eq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD, &txq->txcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD, &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD, &txq->tso_wrs, "# of TSO work requests"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD, &txq->imm_wrs, "# of work requests with immediate data"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD, &txq->sgl_wrs, "# of work requests with direct SGL"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_wrs", CTLFLAG_RD, &txq->txpkts_wrs, "# of txpkts work requests (multiple pkts/WR)"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_pkts", CTLFLAG_RD, &txq->txpkts_pkts, "# of frames tx'd using txpkts work requests"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "br_drops", CTLFLAG_RD, &txq->br->br_drops, "# of drops in the buf_ring for this queue"); SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_dmamap", CTLFLAG_RD, &txq->no_dmamap, 0, "# of times txq ran out of DMA maps"); SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD, &txq->no_desc, 0, "# of times txq ran out of hardware descriptors"); SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "egr_update", CTLFLAG_RD, &eq->egr_update, 0, "egress update notifications from the SGE"); SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD, &eq->unstalled, 0, "# of times txq recovered after stall"); return (rc); } static int free_txq(struct port_info *pi, struct sge_txq *txq) { int rc; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; rc = free_eq(sc, eq); if (rc) return (rc); free(txq->sdesc, M_CXGBE); if (txq->txmaps.maps) t4_free_tx_maps(&txq->txmaps, txq->tx_tag); buf_ring_free(txq->br, M_CXGBE); if (txq->tx_tag) bus_dma_tag_destroy(txq->tx_tag); bzero(txq, sizeof(*txq)); return (0); } static void oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *ba = arg; KASSERT(nseg == 1, ("%s meant for single segment mappings only.", __func__)); *ba = error ? 0 : segs->ds_addr; } static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl) { uint32_t n, v; n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx); MPASS(n > 0); wmb(); v = fl->dbval | V_PIDX(n); if (fl->udb) *fl->udb = htole32(v); else t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v); IDXINCR(fl->dbidx, n, fl->sidx); } /* * Fills up the freelist by allocating upto 'n' buffers. Buffers that are * recycled do not count towards this allocation budget. * * Returns non-zero to indicate that this freelist should be added to the list * of starving freelists. */ static int refill_fl(struct adapter *sc, struct sge_fl *fl, int n) { __be64 *d; struct fl_sdesc *sd; uintptr_t pa; caddr_t cl; struct cluster_layout *cll; struct sw_zone_info *swz; struct cluster_metadata *clm; uint16_t max_pidx; uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */ FL_LOCK_ASSERT_OWNED(fl); /* * We always stop at the begining of the hardware descriptor that's just * before the one with the hw cidx. This is to avoid hw pidx = hw cidx, * which would mean an empty freelist to the chip. */ max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1; if (fl->pidx == max_pidx * 8) return (0); d = &fl->desc[fl->pidx]; sd = &fl->sdesc[fl->pidx]; cll = &fl->cll_def; /* default layout */ swz = &sc->sge.sw_zone_info[cll->zidx]; while (n > 0) { if (sd->cl != NULL) { if (sd->nmbuf == 0) { /* * Fast recycle without involving any atomics on * the cluster's metadata (if the cluster has * metadata). This happens when all frames * received in the cluster were small enough to * fit within a single mbuf each. */ fl->cl_fast_recycled++; #ifdef INVARIANTS clm = cl_metadata(sc, fl, &sd->cll, sd->cl); if (clm != NULL) MPASS(clm->refcount == 1); #endif goto recycled_fast; } /* * Cluster is guaranteed to have metadata. Clusters * without metadata always take the fast recycle path * when they're recycled. */ clm = cl_metadata(sc, fl, &sd->cll, sd->cl); MPASS(clm != NULL); if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { fl->cl_recycled++; counter_u64_add(extfree_rels, 1); goto recycled; } sd->cl = NULL; /* gave up my reference */ } MPASS(sd->cl == NULL); alloc: cl = uma_zalloc(swz->zone, M_NOWAIT); if (__predict_false(cl == NULL)) { if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 || fl->cll_def.zidx == fl->cll_alt.zidx) break; /* fall back to the safe zone */ cll = &fl->cll_alt; swz = &sc->sge.sw_zone_info[cll->zidx]; goto alloc; } fl->cl_allocated++; n--; pa = pmap_kextract((vm_offset_t)cl); pa += cll->region1; sd->cl = cl; sd->cll = *cll; *d = htobe64(pa | cll->hwidx); clm = cl_metadata(sc, fl, cll, cl); if (clm != NULL) { recycled: #ifdef INVARIANTS clm->sd = sd; #endif clm->refcount = 1; } sd->nmbuf = 0; recycled_fast: d++; sd++; if (__predict_false(++fl->pidx % 8 == 0)) { uint16_t pidx = fl->pidx / 8; if (__predict_false(pidx == fl->sidx)) { fl->pidx = 0; pidx = 0; sd = fl->sdesc; d = fl->desc; } if (pidx == max_pidx) break; if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4) ring_fl_db(sc, fl); } } if (fl->pidx / 8 != fl->dbidx) ring_fl_db(sc, fl); return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); } /* * Attempt to refill all starving freelists. */ static void refill_sfl(void *arg) { struct adapter *sc = arg; struct sge_fl *fl, *fl_temp; mtx_lock(&sc->sfl_lock); TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) { FL_LOCK(fl); refill_fl(sc, fl, 64); if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) { TAILQ_REMOVE(&sc->sfl, fl, link); fl->flags &= ~FL_STARVING; } FL_UNLOCK(fl); } if (!TAILQ_EMPTY(&sc->sfl)) callout_schedule(&sc->sfl_callout, hz / 5); mtx_unlock(&sc->sfl_lock); } static int alloc_fl_sdesc(struct sge_fl *fl) { fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); return (0); } static void free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; struct cluster_metadata *clm; struct cluster_layout *cll; int i; sd = fl->sdesc; for (i = 0; i < fl->sidx * 8; i++, sd++) { if (sd->cl == NULL) continue; cll = &sd->cll; clm = cl_metadata(sc, fl, cll, sd->cl); if (sd->nmbuf == 0) uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) { uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); counter_u64_add(extfree_rels, 1); } sd->cl = NULL; } free(fl->sdesc, M_CXGBE); fl->sdesc = NULL; } int t4_alloc_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag, int count, int flags) { struct tx_map *txm; int i, rc; txmaps->map_total = txmaps->map_avail = count; txmaps->map_cidx = txmaps->map_pidx = 0; txmaps->maps = malloc(count * sizeof(struct tx_map), M_CXGBE, M_ZERO | flags); txm = txmaps->maps; for (i = 0; i < count; i++, txm++) { rc = bus_dmamap_create(tx_tag, 0, &txm->map); if (rc != 0) goto failed; } return (0); failed: while (--i >= 0) { txm--; bus_dmamap_destroy(tx_tag, txm->map); } KASSERT(txm == txmaps->maps, ("%s: EDOOFUS", __func__)); free(txmaps->maps, M_CXGBE); txmaps->maps = NULL; return (rc); } void t4_free_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag) { struct tx_map *txm; int i; txm = txmaps->maps; for (i = 0; i < txmaps->map_total; i++, txm++) { if (txm->m) { bus_dmamap_unload(tx_tag, txm->map); m_freem(txm->m); txm->m = NULL; } bus_dmamap_destroy(tx_tag, txm->map); } free(txmaps->maps, M_CXGBE); txmaps->maps = NULL; } /* * We'll do immediate data tx for non-TSO, but only when not coalescing. We're * willing to use upto 2 hardware descriptors which means a maximum of 96 bytes * of immediate data. */ #define IMM_LEN ( \ 2 * EQ_ESIZE \ - sizeof(struct fw_eth_tx_pkt_wr) \ - sizeof(struct cpl_tx_pkt_core)) /* * Returns non-zero on failure, no need to cleanup anything in that case. * * Note 1: We always try to defrag the mbuf if required and return EFBIG only * if the resulting chain still won't fit in a tx descriptor. * * Note 2: We'll pullup the mbuf chain if TSO is requested and the first mbuf * does not have the TCP header in it. */ static int get_pkt_sgl(struct sge_txq *txq, struct mbuf **fp, struct sgl *sgl, int sgl_only) { struct mbuf *m = *fp; struct tx_maps *txmaps; struct tx_map *txm; int rc, defragged = 0, n; TXQ_LOCK_ASSERT_OWNED(txq); if (m->m_pkthdr.tso_segsz) sgl_only = 1; /* Do not allow immediate data with LSO */ start: sgl->nsegs = 0; if (m->m_pkthdr.len <= IMM_LEN && !sgl_only) return (0); /* nsegs = 0 tells caller to use imm. tx */ txmaps = &txq->txmaps; if (txmaps->map_avail == 0) { txq->no_dmamap++; return (ENOMEM); } txm = &txmaps->maps[txmaps->map_pidx]; if (m->m_pkthdr.tso_segsz && m->m_len < 50) { *fp = m_pullup(m, 50); m = *fp; if (m == NULL) return (ENOBUFS); } rc = bus_dmamap_load_mbuf_sg(txq->tx_tag, txm->map, m, sgl->seg, &sgl->nsegs, BUS_DMA_NOWAIT); if (rc == EFBIG && defragged == 0) { m = m_defrag(m, M_NOWAIT); if (m == NULL) return (EFBIG); defragged = 1; *fp = m; goto start; } if (rc != 0) return (rc); txm->m = m; txmaps->map_avail--; if (++txmaps->map_pidx == txmaps->map_total) txmaps->map_pidx = 0; KASSERT(sgl->nsegs > 0 && sgl->nsegs <= TX_SGL_SEGS, ("%s: bad DMA mapping (%d segments)", __func__, sgl->nsegs)); /* * Store the # of flits required to hold this frame's SGL in nflits. An * SGL has a (ULPTX header + len0, addr0) tuple optionally followed by * multiple (len0 + len1, addr0, addr1) tuples. If addr1 is not used * then len1 must be set to 0. */ n = sgl->nsegs - 1; sgl->nflits = (3 * n) / 2 + (n & 1) + 2; return (0); } /* * Releases all the txq resources used up in the specified sgl. */ static int free_pkt_sgl(struct sge_txq *txq, struct sgl *sgl) { struct tx_maps *txmaps; struct tx_map *txm; TXQ_LOCK_ASSERT_OWNED(txq); if (sgl->nsegs == 0) return (0); /* didn't use any map */ txmaps = &txq->txmaps; /* 1 pkt uses exactly 1 map, back it out */ txmaps->map_avail++; if (txmaps->map_pidx > 0) txmaps->map_pidx--; else txmaps->map_pidx = txmaps->map_total - 1; txm = &txmaps->maps[txmaps->map_pidx]; bus_dmamap_unload(txq->tx_tag, txm->map); txm->m = NULL; return (0); } static int write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m, struct sgl *sgl) { struct sge_eq *eq = &txq->eq; struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int nflits, ndesc, pktlen; struct tx_sdesc *txsd; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); pktlen = m->m_pkthdr.len; /* * Do we have enough flits to send this frame out? */ ctrl = sizeof(struct cpl_tx_pkt_core); if (m->m_pkthdr.tso_segsz) { nflits = TXPKT_LSO_WR_HDR; ctrl += sizeof(struct cpl_tx_pkt_lso_core); } else nflits = TXPKT_WR_HDR; if (sgl->nsegs > 0) nflits += sgl->nflits; else { nflits += howmany(pktlen, 8); ctrl += pktlen; } ndesc = howmany(nflits, 8); if (ndesc > eq->avail) return (ENOMEM); /* Firmware work request header */ wr = (void *)&eq->desc[eq->pidx]; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(howmany(nflits, 2)); if (eq->avail == ndesc) { if (!(eq->flags & EQ_CRFLUSHED)) { ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ; eq->flags |= EQ_CRFLUSHED; } eq->flags |= EQ_STALLED; } wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; if (m->m_pkthdr.tso_segsz) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); struct ether_header *eh; void *l3hdr; #if defined(INET) || defined(INET6) struct tcphdr *tcp; #endif uint16_t eh_type; ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE; eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; ctrl |= V_LSO_ETHHDR_LEN(1); l3hdr = evh + 1; eh_type = ntohs(evh->evl_proto); } else l3hdr = eh + 1; switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = l3hdr; /* * XXX-BZ For now we do not pretend to support * IPv6 extension headers. */ KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO " "with ip6_nxt != TCP: %u", __func__, ip6->ip6_nxt)); tcp = (struct tcphdr *)(ip6 + 1); ctrl |= F_LSO_IPV6; ctrl |= V_LSO_IPHDR_LEN(sizeof(*ip6) >> 2) | V_LSO_TCPHDR_LEN(tcp->th_off); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; tcp = (void *)((uintptr_t)ip + ip->ip_hl * 4); ctrl |= V_LSO_IPHDR_LEN(ip->ip_hl) | V_LSO_TCPHDR_LEN(tcp->th_off); break; } #endif default: panic("%s: CSUM_TSO but no supported IP version " "(0x%04x)", __func__, eh_type); } lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); cpl = (void *)(lso + 1); txq->tso_wrs++; } else cpl = (void *)(wr + 1); /* Checksum offload */ ctrl1 = 0; if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (m->m_flags & M_VLANTAG) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf)); cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* Software descriptor */ txsd = &txq->sdesc[eq->pidx]; txsd->desc_used = ndesc; eq->pending += ndesc; eq->avail -= ndesc; eq->pidx += ndesc; if (eq->pidx >= eq->cap) eq->pidx -= eq->cap; /* SGL */ dst = (void *)(cpl + 1); if (sgl->nsegs > 0) { txsd->credits = 1; txq->sgl_wrs++; write_sgl_to_txd(eq, sgl, &dst); } else { txsd->credits = 0; txq->imm_wrs++; for (; m; m = m->m_next) { copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); #ifdef INVARIANTS pktlen -= m->m_len; #endif } #ifdef INVARIANTS KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); #endif } txq->txpkt_wrs++; return (0); } /* * Returns 0 to indicate that m has been accepted into a coalesced tx work * request. It has either been folded into txpkts or txpkts was flushed and m * has started a new coalesced work request (as the first frame in a fresh * txpkts). * * Returns non-zero to indicate a failure - caller is responsible for * transmitting m, if there was anything in txpkts it has been flushed. */ static int add_to_txpkts(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts, struct mbuf *m, struct sgl *sgl) { struct sge_eq *eq = &txq->eq; int can_coalesce; struct tx_sdesc *txsd; int flits; TXQ_LOCK_ASSERT_OWNED(txq); KASSERT(sgl->nsegs, ("%s: can't coalesce imm data", __func__)); if (txpkts->npkt > 0) { flits = TXPKTS_PKT_HDR + sgl->nflits; can_coalesce = m->m_pkthdr.tso_segsz == 0 && txpkts->nflits + flits <= TX_WR_FLITS && txpkts->nflits + flits <= eq->avail * 8 && txpkts->plen + m->m_pkthdr.len < 65536; if (can_coalesce) { txpkts->npkt++; txpkts->nflits += flits; txpkts->plen += m->m_pkthdr.len; txsd = &txq->sdesc[eq->pidx]; txsd->credits++; return (0); } /* * Couldn't coalesce m into txpkts. The first order of business * is to send txpkts on its way. Then we'll revisit m. */ write_txpkts_wr(txq, txpkts); } /* * Check if we can start a new coalesced tx work request with m as * the first packet in it. */ KASSERT(txpkts->npkt == 0, ("%s: txpkts not empty", __func__)); flits = TXPKTS_WR_HDR + sgl->nflits; can_coalesce = m->m_pkthdr.tso_segsz == 0 && flits <= eq->avail * 8 && flits <= TX_WR_FLITS; if (can_coalesce == 0) return (EINVAL); /* * Start a fresh coalesced tx WR with m as the first frame in it. */ txpkts->npkt = 1; txpkts->nflits = flits; txpkts->flitp = &eq->desc[eq->pidx].flit[2]; txpkts->plen = m->m_pkthdr.len; txsd = &txq->sdesc[eq->pidx]; txsd->credits = 1; return (0); } /* * Note that write_txpkts_wr can never run out of hardware descriptors (but * write_txpkt_wr can). add_to_txpkts ensures that a frame is accepted for * coalescing only if sufficient hardware descriptors are available. */ static void write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts) { struct sge_eq *eq = &txq->eq; struct fw_eth_tx_pkts_wr *wr; struct tx_sdesc *txsd; uint32_t ctrl; int ndesc; TXQ_LOCK_ASSERT_OWNED(txq); ndesc = howmany(txpkts->nflits, 8); wr = (void *)&eq->desc[eq->pidx]; wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2)); if (eq->avail == ndesc) { if (!(eq->flags & EQ_CRFLUSHED)) { ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ; eq->flags |= EQ_CRFLUSHED; } eq->flags |= EQ_STALLED; } wr->equiq_to_len16 = htobe32(ctrl); wr->plen = htobe16(txpkts->plen); wr->npkt = txpkts->npkt; wr->r3 = wr->type = 0; /* Everything else already written */ txsd = &txq->sdesc[eq->pidx]; txsd->desc_used = ndesc; KASSERT(eq->avail >= ndesc, ("%s: out of descriptors", __func__)); eq->pending += ndesc; eq->avail -= ndesc; eq->pidx += ndesc; if (eq->pidx >= eq->cap) eq->pidx -= eq->cap; txq->txpkts_pkts += txpkts->npkt; txq->txpkts_wrs++; txpkts->npkt = 0; /* emptied */ } static inline void write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts, struct mbuf *m, struct sgl *sgl) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; struct cpl_tx_pkt_core *cpl; struct sge_eq *eq = &txq->eq; uintptr_t flitp, start, end; uint64_t ctrl; caddr_t dst; KASSERT(txpkts->npkt > 0, ("%s: txpkts is empty", __func__)); start = (uintptr_t)eq->desc; end = (uintptr_t)eq->spg; /* Checksum offload */ ctrl = 0; if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))) ctrl |= F_TXPKT_IPCSUM_DIS; if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))) ctrl |= F_TXPKT_L4CSUM_DIS; if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (m->m_flags & M_VLANTAG) { ctrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* * The previous packet's SGL must have ended at a 16 byte boundary (this * is required by the firmware/hardware). It follows that flitp cannot * wrap around between the ULPTX master command and ULPTX subcommand (8 * bytes each), and that it can not wrap around in the middle of the * cpl_tx_pkt_core either. */ flitp = (uintptr_t)txpkts->flitp; KASSERT((flitp & 0xf) == 0, ("%s: last SGL did not end at 16 byte boundary: %p", __func__, txpkts->flitp)); /* ULP master command */ ulpmc = (void *)flitp; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); ulpmc->len = htonl(howmany(sizeof(*ulpmc) + sizeof(*ulpsc) + sizeof(*cpl) + 8 * sgl->nflits, 16)); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) | F_ULP_TX_SC_MORE); ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); flitp += sizeof(*ulpmc) + sizeof(*ulpsc); if (flitp == end) flitp = start; /* CPL_TX_PKT */ cpl = (void *)flitp; cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf)); cpl->pack = 0; cpl->len = htobe16(m->m_pkthdr.len); cpl->ctrl1 = htobe64(ctrl); flitp += sizeof(*cpl); if (flitp == end) flitp = start; /* SGL for this frame */ dst = (caddr_t)flitp; txpkts->nflits += write_sgl_to_txd(eq, sgl, &dst); txpkts->flitp = (void *)dst; KASSERT(((uintptr_t)dst & 0xf) == 0, ("%s: SGL ends at %p (not a 16 byte boundary)", __func__, dst)); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. It returns 1 in that case. */ static int write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to) { __be64 *flitp, *end; struct ulptx_sgl *usgl; bus_dma_segment_t *seg; int i, padded; KASSERT(sgl->nsegs > 0 && sgl->nflits > 0, ("%s: bad SGL - nsegs=%d, nflits=%d", __func__, sgl->nsegs, sgl->nflits)); KASSERT(((uintptr_t)(*to) & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); flitp = (__be64 *)(*to); end = flitp + sgl->nflits; seg = &sgl->seg[0]; usgl = (void *)flitp; /* * We start at a 16 byte boundary somewhere inside the tx descriptor * ring, so we're at least 16 bytes away from the status page. There is * no chance of a wrap around in the middle of usgl (which is 16 bytes). */ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(sgl->nsegs)); usgl->len0 = htobe32(seg->ds_len); usgl->addr0 = htobe64(seg->ds_addr); seg++; if ((uintptr_t)end <= (uintptr_t)eq->spg) { /* Won't wrap around at all */ for (i = 0; i < sgl->nsegs - 1; i++, seg++) { usgl->sge[i / 2].len[i & 1] = htobe32(seg->ds_len); usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ds_addr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); } else { /* Will wrap somewhere in the rest of the SGL */ /* 2 flits already written, write the rest flit by flit */ flitp = (void *)(usgl + 1); for (i = 0; i < sgl->nflits - 2; i++) { if ((uintptr_t)flitp == (uintptr_t)eq->spg) flitp = (void *)eq->desc; *flitp++ = get_flit(seg, sgl->nsegs - 1, i); } end = flitp; } if ((uintptr_t)end & 0xf) { *(uint64_t *)end = 0; end++; padded = 1; } else padded = 0; if ((uintptr_t)end == (uintptr_t)eq->spg) *to = (void *)eq->desc; else *to = (void *)end; return (padded); } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) { bcopy(from, *to, len); (*to) += len; } else { int portion = (uintptr_t)eq->spg - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ bcopy(from, (void *)eq->desc, portion); (*to) = (caddr_t)eq->desc + portion; } } static inline void ring_eq_db(struct adapter *sc, struct sge_eq *eq) { u_int db, pending; db = eq->doorbells; pending = eq->pending; if (pending > 1) clrbit(&db, DOORBELL_WCWR); eq->pending = 0; wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending)); return; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; int i; /* * Queues whose 128B doorbell segment fits in the page do not * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ KASSERT(eq->udb_qid == 0 && pending == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", __func__, eq->doorbells, pending, eq->pidx, eq)); dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); i = eq->pidx ? eq->pidx - 1 : eq->cap - 1; src = (void *)&eq->desc[i]; while (src != (void *)&eq->desc[i + 1]) *dst++ = *src++; wmb(); return; } case DOORBELL_UDBWC: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending)); wmb(); return; case DOORBELL_KDB: t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), V_QID(eq->cntxt_id) | V_PIDX(pending)); return; } } static inline int reclaimable(struct sge_eq *eq) { unsigned int cidx; cidx = eq->spg->cidx; /* stable snapshot */ cidx = be16toh(cidx); if (cidx >= eq->cidx) return (cidx - eq->cidx); else return (cidx + eq->cap - eq->cidx); } /* * There are "can_reclaim" tx descriptors ready to be reclaimed. Reclaim as * many as possible but stop when there are around "n" mbufs to free. * * The actual number reclaimed is provided as the return value. */ static int reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n) { struct tx_sdesc *txsd; struct tx_maps *txmaps; struct tx_map *txm; unsigned int reclaimed, maps; struct sge_eq *eq = &txq->eq; TXQ_LOCK_ASSERT_OWNED(txq); if (can_reclaim == 0) can_reclaim = reclaimable(eq); maps = reclaimed = 0; while (can_reclaim && maps < n) { int ndesc; txsd = &txq->sdesc[eq->cidx]; ndesc = txsd->desc_used; /* Firmware doesn't return "partial" credits. */ KASSERT(can_reclaim >= ndesc, ("%s: unexpected number of credits: %d, %d", __func__, can_reclaim, ndesc)); maps += txsd->credits; reclaimed += ndesc; can_reclaim -= ndesc; eq->cidx += ndesc; if (__predict_false(eq->cidx >= eq->cap)) eq->cidx -= eq->cap; } txmaps = &txq->txmaps; txm = &txmaps->maps[txmaps->map_cidx]; if (maps) prefetch(txm->m); eq->avail += reclaimed; KASSERT(eq->avail < eq->cap, /* avail tops out at (cap - 1) */ ("%s: too many descriptors available", __func__)); txmaps->map_avail += maps; KASSERT(txmaps->map_avail <= txmaps->map_total, ("%s: too many maps available", __func__)); while (maps--) { struct tx_map *next; next = txm + 1; if (__predict_false(txmaps->map_cidx + 1 == txmaps->map_total)) next = txmaps->maps; prefetch(next->m); bus_dmamap_unload(txq->tx_tag, txm->map); m_freem(txm->m); txm->m = NULL; txm = next; if (__predict_false(++txmaps->map_cidx == txmaps->map_total)) txmaps->map_cidx = 0; } return (reclaimed); } static void write_eqflush_wr(struct sge_eq *eq) { struct fw_eq_flush_wr *wr; EQ_LOCK_ASSERT_OWNED(eq); KASSERT(eq->avail > 0, ("%s: no descriptors left.", __func__)); KASSERT(!(eq->flags & EQ_CRFLUSHED), ("%s: flushed already", __func__)); wr = (void *)&eq->desc[eq->pidx]; bzero(wr, sizeof(*wr)); wr->opcode = FW_EQ_FLUSH_WR; wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(sizeof(*wr) / 16) | F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); eq->flags |= (EQ_CRFLUSHED | EQ_STALLED); eq->pending++; eq->avail--; if (++eq->pidx == eq->cap) eq->pidx = 0; } static __be64 get_flit(bus_dma_segment_t *sgl, int nsegs, int idx) { int i = (idx / 3) * 2; switch (idx % 3) { case 0: { __be64 rc; rc = htobe32(sgl[i].ds_len); if (i + 1 < nsegs) rc |= (uint64_t)htobe32(sgl[i + 1].ds_len) << 32; return (rc); } case 1: return htobe64(sgl[i].ds_addr); case 2: return htobe64(sgl[i + 1].ds_addr); } return (0); } static void find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp) { int8_t zidx, hwidx, idx; uint16_t region1, region3; int spare, spare_needed, n; struct sw_zone_info *swz; struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0]; /* * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize * large enough for the max payload and cluster metadata. Otherwise * settle for the largest bufsize that leaves enough room in the cluster * for metadata. * * Without buffer packing: Look for the smallest zone which has a * bufsize large enough for the max payload. Settle for the largest * bufsize available if there's nothing big enough for max payload. */ spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0; swz = &sc->sge.sw_zone_info[0]; hwidx = -1; for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) { if (swz->size > largest_rx_cluster) { if (__predict_true(hwidx != -1)) break; /* * This is a misconfiguration. largest_rx_cluster is * preventing us from finding a refill source. See * dev.t5nex..buffer_sizes to figure out why. */ device_printf(sc->dev, "largest_rx_cluster=%u leaves no" " refill source for fl %p (dma %u). Ignored.\n", largest_rx_cluster, fl, maxp); } for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (spare < spare_needed) continue; hwidx = idx; /* best option so far */ if (hwb->size >= maxp) { if ((fl->flags & FL_BUF_PACKING) == 0) goto done; /* stop looking (not packing) */ if (swz->size >= safest_rx_cluster) goto done; /* stop looking (packing) */ } break; /* keep looking, next zone */ } } done: /* A usable hwidx has been located. */ MPASS(hwidx != -1); hwb = &hwb_list[hwidx]; zidx = hwb->zidx; swz = &sc->sge.sw_zone_info[zidx]; region1 = 0; region3 = swz->size - hwb->size; /* * Stay within this zone and see if there is a better match when mbuf * inlining is allowed. Remember that the hwidx's are sorted in * decreasing order of size (so in increasing order of spare area). */ for (idx = hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (allow_mbufs_in_cluster == 0 || hwb->size < maxp) break; if (spare < CL_METADATA_SIZE + MSIZE) continue; n = (spare - CL_METADATA_SIZE) / MSIZE; if (n > howmany(hwb->size, maxp)) break; hwidx = idx; if (fl->flags & FL_BUF_PACKING) { region1 = n * MSIZE; region3 = spare - region1; } else { region1 = MSIZE; region3 = spare - region1; break; } } KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES, ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp)); KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES, ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp)); KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 == sc->sge.sw_zone_info[zidx].size, ("%s: bad buffer layout for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); if (fl->flags & FL_BUF_PACKING || region1 > 0) { KASSERT(region3 >= CL_METADATA_SIZE, ("%s: no room for metadata. fl %p, maxp %d; " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); KASSERT(region1 % MSIZE == 0, ("%s: bad mbuf region for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); } fl->cll_def.zidx = zidx; fl->cll_def.hwidx = hwidx; fl->cll_def.region1 = region1; fl->cll_def.region3 = region3; } static void find_safe_refill_source(struct adapter *sc, struct sge_fl *fl) { struct sge *s = &sc->sge; struct hw_buf_info *hwb; struct sw_zone_info *swz; int spare; int8_t hwidx; if (fl->flags & FL_BUF_PACKING) hwidx = s->safe_hwidx2; /* with room for metadata */ else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) { hwidx = s->safe_hwidx2; hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; /* no good if there isn't room for an mbuf as well */ if (spare < CL_METADATA_SIZE + MSIZE) hwidx = s->safe_hwidx1; } else hwidx = s->safe_hwidx1; if (hwidx == -1) { /* No fallback source */ fl->cll_alt.hwidx = -1; fl->cll_alt.zidx = -1; return; } hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; fl->cll_alt.hwidx = hwidx; fl->cll_alt.zidx = hwb->zidx; if (allow_mbufs_in_cluster) fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE; else fl->cll_alt.region1 = 0; fl->cll_alt.region3 = spare - fl->cll_alt.region1; } static void add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); if ((fl->flags & FL_DOOMED) == 0) { fl->flags |= FL_STARVING; TAILQ_INSERT_TAIL(&sc->sfl, fl, link); callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc); } FL_UNLOCK(fl); mtx_unlock(&sc->sfl_lock); } static int handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1); unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid)); struct adapter *sc = iq->adapter; struct sge *s = &sc->sge; struct sge_eq *eq; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); eq = s->eqmap[qid - s->eq_start]; EQ_LOCK(eq); KASSERT(eq->flags & EQ_CRFLUSHED, ("%s: unsolicited egress update", __func__)); eq->flags &= ~EQ_CRFLUSHED; eq->egr_update++; if (__predict_false(eq->flags & EQ_DOOMED)) wakeup_one(eq); else if (eq->flags & EQ_STALLED && can_resume_tx(eq)) taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task); EQ_UNLOCK(eq); return (0); } /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */ CTASSERT(offsetof(struct cpl_fw4_msg, data) == \ offsetof(struct cpl_fw6_msg, data)); static int handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw6_msg *cpl = (const void *)(rss + 1); KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) { const struct rss_header *rss2; rss2 = (const struct rss_header *)&cpl->data[0]; return (sc->cpl_handler[rss2->opcode](iq, rss2, m)); } return (sc->fw_msg_handler[cpl->type](sc, &cpl->data[0])); } static int sysctl_uint16(SYSCTL_HANDLER_ARGS) { uint16_t *id = arg1; int i = *id; return sysctl_handle_int(oidp, &i, 0, req); } static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS) { struct sge *s = arg1; struct hw_buf_info *hwb = &s->hw_buf_info[0]; struct sw_zone_info *swz = &s->sw_zone_info[0]; int i, rc; struct sbuf sb; char c; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster) c = '*'; else c = '\0'; sbuf_printf(&sb, "%u%c ", hwb->size, c); } sbuf_trim(&sb); sbuf_finish(&sb); rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (rc); } Index: head/sys/dev/e1000/if_igb.c =================================================================== --- head/sys/dev/e1000/if_igb.c (revision 275357) +++ head/sys/dev/e1000/if_igb.c (revision 275358) @@ -1,6421 +1,6420 @@ /****************************************************************************** Copyright (c) 2001-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_device_polling.h" #include "opt_altq.h" #endif #include #include #ifndef IGB_LEGACY_TX #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #include #include #include "e1000_api.h" #include "e1000_82575.h" #include "if_igb.h" /********************************************************************* * Set this to one to display debug statistics *********************************************************************/ int igb_display_debug_stats = 0; /********************************************************************* * Driver version: *********************************************************************/ char igb_driver_version[] = "version - 2.4.0"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into e1000_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static igb_vendor_info_t igb_vendor_info_array[] = { { 0x8086, E1000_DEV_ID_82575EB_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82575EB_FIBER_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82575GB_QUAD_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_NS, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_NS_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_SERDES_QUAD, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_QUAD_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_QUAD_COPPER_ET2, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82576_VF, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_COPPER_DUAL, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_82580_QUAD_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_SFP, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_BACKPLANE, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I350_VF, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_COPPER_IT, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_COPPER_OEM1, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_COPPER_FLASHLESS, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_SERDES_FLASHLESS, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I210_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I211_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I354_BACKPLANE_1GBPS, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I354_BACKPLANE_2_5GBPS, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_I354_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, /* required last entry */ { 0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings for all supported NICs. *********************************************************************/ static char *igb_strings[] = { "Intel(R) PRO/1000 Network Connection" }; /********************************************************************* * Function prototypes *********************************************************************/ static int igb_per_unit_num_queues(SYSCTL_HANDLER_ARGS); static int igb_probe(device_t); static int igb_attach(device_t); static int igb_detach(device_t); static int igb_shutdown(device_t); static int igb_suspend(device_t); static int igb_resume(device_t); #ifndef IGB_LEGACY_TX static int igb_mq_start(struct ifnet *, struct mbuf *); static int igb_mq_start_locked(struct ifnet *, struct tx_ring *); static void igb_qflush(struct ifnet *); static void igb_deferred_mq_start(void *, int); #else static void igb_start(struct ifnet *); static void igb_start_locked(struct tx_ring *, struct ifnet *ifp); #endif static int igb_ioctl(struct ifnet *, u_long, caddr_t); static uint64_t igb_get_counter(if_t, ift_counter); static void igb_init(void *); static void igb_init_locked(struct adapter *); static void igb_stop(void *); static void igb_media_status(struct ifnet *, struct ifmediareq *); static int igb_media_change(struct ifnet *); static void igb_identify_hardware(struct adapter *); static int igb_allocate_pci_resources(struct adapter *); static int igb_allocate_msix(struct adapter *); static int igb_allocate_legacy(struct adapter *); static int igb_setup_msix(struct adapter *); static void igb_free_pci_resources(struct adapter *); static void igb_local_timer(void *); static void igb_reset(struct adapter *); static int igb_setup_interface(device_t, struct adapter *); static int igb_allocate_queues(struct adapter *); static void igb_configure_queues(struct adapter *); static int igb_allocate_transmit_buffers(struct tx_ring *); static void igb_setup_transmit_structures(struct adapter *); static void igb_setup_transmit_ring(struct tx_ring *); static void igb_initialize_transmit_units(struct adapter *); static void igb_free_transmit_structures(struct adapter *); static void igb_free_transmit_buffers(struct tx_ring *); static int igb_allocate_receive_buffers(struct rx_ring *); static int igb_setup_receive_structures(struct adapter *); static int igb_setup_receive_ring(struct rx_ring *); static void igb_initialize_receive_units(struct adapter *); static void igb_free_receive_structures(struct adapter *); static void igb_free_receive_buffers(struct rx_ring *); static void igb_free_receive_ring(struct rx_ring *); static void igb_enable_intr(struct adapter *); static void igb_disable_intr(struct adapter *); static void igb_update_stats_counters(struct adapter *); static bool igb_txeof(struct tx_ring *); static __inline void igb_rx_discard(struct rx_ring *, int); static __inline void igb_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u32); static bool igb_rxeof(struct igb_queue *, int, int *); static void igb_rx_checksum(u32, struct mbuf *, u32); static int igb_tx_ctx_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); static int igb_tso_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); static void igb_set_promisc(struct adapter *); static void igb_disable_promisc(struct adapter *); static void igb_set_multi(struct adapter *); static void igb_update_link_status(struct adapter *); static void igb_refresh_mbufs(struct rx_ring *, int); static void igb_register_vlan(void *, struct ifnet *, u16); static void igb_unregister_vlan(void *, struct ifnet *, u16); static void igb_setup_vlan_hw_support(struct adapter *); static int igb_xmit(struct tx_ring *, struct mbuf **); static int igb_dma_malloc(struct adapter *, bus_size_t, struct igb_dma_alloc *, int); static void igb_dma_free(struct adapter *, struct igb_dma_alloc *); static int igb_sysctl_nvm_info(SYSCTL_HANDLER_ARGS); static void igb_print_nvm_info(struct adapter *); static int igb_is_valid_ether_addr(u8 *); static void igb_add_hw_stats(struct adapter *); static void igb_vf_init_stats(struct adapter *); static void igb_update_vf_stats_counters(struct adapter *); /* Management and WOL Support */ static void igb_init_manageability(struct adapter *); static void igb_release_manageability(struct adapter *); static void igb_get_hw_control(struct adapter *); static void igb_release_hw_control(struct adapter *); static void igb_enable_wakeup(device_t); static void igb_led_func(void *, int); static int igb_irq_fast(void *); static void igb_msix_que(void *); static void igb_msix_link(void *); static void igb_handle_que(void *context, int pending); static void igb_handle_link(void *context, int pending); static void igb_handle_link_locked(struct adapter *); static void igb_set_sysctl_value(struct adapter *, const char *, const char *, int *, int); static int igb_set_flowcntl(SYSCTL_HANDLER_ARGS); static int igb_sysctl_dmac(SYSCTL_HANDLER_ARGS); static int igb_sysctl_eee(SYSCTL_HANDLER_ARGS); #ifdef DEVICE_POLLING static poll_handler_t igb_poll; #endif /* POLLING */ /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t igb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, igb_probe), DEVMETHOD(device_attach, igb_attach), DEVMETHOD(device_detach, igb_detach), DEVMETHOD(device_shutdown, igb_shutdown), DEVMETHOD(device_suspend, igb_suspend), DEVMETHOD(device_resume, igb_resume), DEVMETHOD_END }; static driver_t igb_driver = { "igb", igb_methods, sizeof(struct adapter), }; static devclass_t igb_devclass; DRIVER_MODULE(igb, pci, igb_driver, igb_devclass, 0, 0); MODULE_DEPEND(igb, pci, 1, 1, 1); MODULE_DEPEND(igb, ether, 1, 1, 1); /********************************************************************* * Tunable default values. *********************************************************************/ static SYSCTL_NODE(_hw, OID_AUTO, igb, CTLFLAG_RD, 0, "IGB driver parameters"); /* Descriptor defaults */ static int igb_rxd = IGB_DEFAULT_RXD; static int igb_txd = IGB_DEFAULT_TXD; SYSCTL_INT(_hw_igb, OID_AUTO, rxd, CTLFLAG_RDTUN, &igb_rxd, 0, "Number of receive descriptors per queue"); SYSCTL_INT(_hw_igb, OID_AUTO, txd, CTLFLAG_RDTUN, &igb_txd, 0, "Number of transmit descriptors per queue"); /* ** AIM: Adaptive Interrupt Moderation ** which means that the interrupt rate ** is varied over time based on the ** traffic for that interrupt vector */ static int igb_enable_aim = TRUE; SYSCTL_INT(_hw_igb, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &igb_enable_aim, 0, "Enable adaptive interrupt moderation"); /* * MSIX should be the default for best performance, * but this allows it to be forced off for testing. */ static int igb_enable_msix = 1; SYSCTL_INT(_hw_igb, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &igb_enable_msix, 0, "Enable MSI-X interrupts"); /* ** Tuneable Interrupt rate */ static int igb_max_interrupt_rate = 8000; SYSCTL_INT(_hw_igb, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN, &igb_max_interrupt_rate, 0, "Maximum interrupts per second"); #ifndef IGB_LEGACY_TX /* ** Tuneable number of buffers in the buf-ring (drbr_xxx) */ static int igb_buf_ring_size = IGB_BR_SIZE; SYSCTL_INT(_hw_igb, OID_AUTO, buf_ring_size, CTLFLAG_RDTUN, &igb_buf_ring_size, 0, "Size of the bufring"); #endif /* ** Header split causes the packet header to ** be dma'd to a seperate mbuf from the payload. ** this can have memory alignment benefits. But ** another plus is that small packets often fit ** into the header and thus use no cluster. Its ** a very workload dependent type feature. */ static int igb_header_split = FALSE; SYSCTL_INT(_hw_igb, OID_AUTO, header_split, CTLFLAG_RDTUN, &igb_header_split, 0, "Enable receive mbuf header split"); /* ** This will autoconfigure based on the ** number of CPUs and max supported ** MSIX messages if left at 0. */ static int igb_num_queues = 0; SYSCTL_INT(_hw_igb, OID_AUTO, num_queues, CTLFLAG_RDTUN, &igb_num_queues, 0, "Number of queues to configure, 0 indicates autoconfigure"); /* ** Global variable to store last used CPU when binding queues ** to CPUs in igb_allocate_msix. Starts at CPU_FIRST and increments when a ** queue is bound to a cpu. */ static int igb_last_bind_cpu = -1; /* How many packets rxeof tries to clean at a time */ static int igb_rx_process_limit = 100; SYSCTL_INT(_hw_igb, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, &igb_rx_process_limit, 0, "Maximum number of received packets to process at a time, -1 means unlimited"); #ifdef DEV_NETMAP /* see ixgbe.c for details */ #include #endif /* DEV_NETMAP */ /********************************************************************* * Device identification routine * * igb_probe determines if the driver should be loaded on * adapter based on PCI vendor/device id of the adapter. * * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int igb_probe(device_t dev) { char adapter_name[60]; uint16_t pci_vendor_id = 0; uint16_t pci_device_id = 0; uint16_t pci_subvendor_id = 0; uint16_t pci_subdevice_id = 0; igb_vendor_info_t *ent; INIT_DEBUGOUT("igb_probe: begin"); pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != IGB_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = igb_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == PCI_ANY_ID)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == PCI_ANY_ID))) { sprintf(adapter_name, "%s %s", igb_strings[ent->index], igb_driver_version); device_set_desc_copy(dev, adapter_name); return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. * * return 0 on success, positive on failure *********************************************************************/ static int igb_attach(device_t dev) { struct adapter *adapter; int error = 0; u16 eeprom_data; INIT_DEBUGOUT("igb_attach: begin"); if (resource_disabled("igb", device_get_unit(dev))) { device_printf(dev, "Disabled by device hint\n"); return (ENXIO); } adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; IGB_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTL stuff */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_nvm_info, "I", "NVM Information"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "num_queues", CTLTYPE_INT | CTLFLAG_RD, adapter, 0, igb_per_unit_num_queues, "I", "Number of Queues"); igb_set_sysctl_value(adapter, "enable_aim", "Interrupt Moderation", &adapter->enable_aim, igb_enable_aim); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "fc", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_set_flowcntl, "I", "Flow Control"); callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); /* Determine hardware and mac info */ igb_identify_hardware(adapter); /* Setup PCI resources */ if (igb_allocate_pci_resources(adapter)) { device_printf(dev, "Allocation of PCI resources failed\n"); error = ENXIO; goto err_pci; } /* Do Shared Code initialization */ if (e1000_setup_init_funcs(&adapter->hw, TRUE)) { device_printf(dev, "Setup of Shared code failed\n"); error = ENXIO; goto err_pci; } e1000_get_bus_info(&adapter->hw); /* Sysctl for limiting the amount of work done in the taskqueue */ igb_set_sysctl_value(adapter, "rx_processing_limit", "max number of rx packets to process", &adapter->rx_process_limit, igb_rx_process_limit); /* * Validate number of transmit and receive descriptors. It * must not exceed hardware maximum, and must be multiple * of E1000_DBA_ALIGN. */ if (((igb_txd * sizeof(struct e1000_tx_desc)) % IGB_DBA_ALIGN) != 0 || (igb_txd > IGB_MAX_TXD) || (igb_txd < IGB_MIN_TXD)) { device_printf(dev, "Using %d TX descriptors instead of %d!\n", IGB_DEFAULT_TXD, igb_txd); adapter->num_tx_desc = IGB_DEFAULT_TXD; } else adapter->num_tx_desc = igb_txd; if (((igb_rxd * sizeof(struct e1000_rx_desc)) % IGB_DBA_ALIGN) != 0 || (igb_rxd > IGB_MAX_RXD) || (igb_rxd < IGB_MIN_RXD)) { device_printf(dev, "Using %d RX descriptors instead of %d!\n", IGB_DEFAULT_RXD, igb_rxd); adapter->num_rx_desc = IGB_DEFAULT_RXD; } else adapter->num_rx_desc = igb_rxd; adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_wait_to_complete = FALSE; adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT; /* Copper options */ if (adapter->hw.phy.media_type == e1000_media_type_copper) { adapter->hw.phy.mdix = AUTO_ALL_MODES; adapter->hw.phy.disable_polarity_correction = FALSE; adapter->hw.phy.ms_type = IGB_MASTER_SLAVE; } /* * Set the frame limits assuming * standard ethernet sized frames. */ adapter->max_frame_size = ETHERMTU + ETHER_HDR_LEN + ETHERNET_FCS_SIZE; /* ** Allocate and Setup Queues */ if (igb_allocate_queues(adapter)) { error = ENOMEM; goto err_pci; } /* Allocate the appropriate stats memory */ if (adapter->vf_ifp) { adapter->stats = (struct e1000_vf_stats *)malloc(sizeof \ (struct e1000_vf_stats), M_DEVBUF, M_NOWAIT | M_ZERO); igb_vf_init_stats(adapter); } else adapter->stats = (struct e1000_hw_stats *)malloc(sizeof \ (struct e1000_hw_stats), M_DEVBUF, M_NOWAIT | M_ZERO); if (adapter->stats == NULL) { device_printf(dev, "Can not allocate stats memory\n"); error = ENOMEM; goto err_late; } /* Allocate multicast array memory. */ adapter->mta = malloc(sizeof(u8) * ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES, M_DEVBUF, M_NOWAIT); if (adapter->mta == NULL) { device_printf(dev, "Can not allocate multicast setup array\n"); error = ENOMEM; goto err_late; } /* Some adapter-specific advanced features */ if (adapter->hw.mac.type >= e1000_i350) { SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "dmac", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_dmac, "I", "DMA Coalesce"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "eee_disabled", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_eee, "I", "Disable Energy Efficient Ethernet"); if (adapter->hw.phy.media_type == e1000_media_type_copper) { if (adapter->hw.mac.type == e1000_i354) e1000_set_eee_i354(&adapter->hw); else e1000_set_eee_i350(&adapter->hw); } } /* ** Start from a known state, this is ** important in reading the nvm and ** mac from that. */ e1000_reset_hw(&adapter->hw); /* Make sure we have a good EEPROM before we read from it */ if (((adapter->hw.mac.type != e1000_i210) && (adapter->hw.mac.type != e1000_i211)) && (e1000_validate_nvm_checksum(&adapter->hw) < 0)) { /* ** Some PCI-E parts fail the first check due to ** the link being in sleep state, call it again, ** if it fails a second time its a real issue. */ if (e1000_validate_nvm_checksum(&adapter->hw) < 0) { device_printf(dev, "The EEPROM Checksum Is Not Valid\n"); error = EIO; goto err_late; } } /* ** Copy the permanent MAC address out of the EEPROM */ if (e1000_read_mac_addr(&adapter->hw) < 0) { device_printf(dev, "EEPROM read error while reading MAC" " address\n"); error = EIO; goto err_late; } /* Check its sanity */ if (!igb_is_valid_ether_addr(adapter->hw.mac.addr)) { device_printf(dev, "Invalid MAC address\n"); error = EIO; goto err_late; } /* Setup OS specific network interface */ if (igb_setup_interface(dev, adapter) != 0) goto err_late; /* Now get a good starting state */ igb_reset(adapter); /* Initialize statistics */ igb_update_stats_counters(adapter); adapter->hw.mac.get_link_status = 1; igb_update_link_status(adapter); /* Indicate SOL/IDER usage */ if (e1000_check_reset_block(&adapter->hw)) device_printf(dev, "PHY reset is blocked due to SOL/IDER session.\n"); /* Determine if we have to control management hardware */ adapter->has_manage = e1000_enable_mng_pass_thru(&adapter->hw); /* * Setup Wake-on-Lan */ /* APME bit in EEPROM is mapped to WUC.APME */ eeprom_data = E1000_READ_REG(&adapter->hw, E1000_WUC) & E1000_WUC_APME; if (eeprom_data) adapter->wol = E1000_WUFC_MAG; /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, igb_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, igb_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); igb_add_hw_stats(adapter); /* Tell the stack that the interface is not active */ adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->ifp->if_drv_flags |= IFF_DRV_OACTIVE; adapter->led_dev = led_create(igb_led_func, adapter, device_get_nameunit(dev)); /* ** Configure Interrupts */ if ((adapter->msix > 1) && (igb_enable_msix)) error = igb_allocate_msix(adapter); else /* MSI or Legacy */ error = igb_allocate_legacy(adapter); if (error) goto err_late; #ifdef DEV_NETMAP igb_netmap_attach(adapter); #endif /* DEV_NETMAP */ INIT_DEBUGOUT("igb_attach: end"); return (0); err_late: igb_detach(dev); igb_free_transmit_structures(adapter); igb_free_receive_structures(adapter); igb_release_hw_control(adapter); err_pci: igb_free_pci_resources(adapter); if (adapter->ifp != NULL) if_free(adapter->ifp); free(adapter->mta, M_DEVBUF); IGB_CORE_LOCK_DESTROY(adapter); return (error); } /********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. * * return 0 on success, positive on failure *********************************************************************/ static int igb_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; INIT_DEBUGOUT("igb_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } ether_ifdetach(adapter->ifp); if (adapter->led_dev != NULL) led_destroy(adapter->led_dev); #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) ether_poll_deregister(ifp); #endif IGB_CORE_LOCK(adapter); adapter->in_detach = 1; igb_stop(adapter); IGB_CORE_UNLOCK(adapter); e1000_phy_hw_reset(&adapter->hw); /* Give control back to firmware */ igb_release_manageability(adapter); igb_release_hw_control(adapter); if (adapter->wol) { E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); E1000_WRITE_REG(&adapter->hw, E1000_WUFC, adapter->wol); igb_enable_wakeup(dev); } /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); callout_drain(&adapter->timer); #ifdef DEV_NETMAP netmap_detach(adapter->ifp); #endif /* DEV_NETMAP */ igb_free_pci_resources(adapter); bus_generic_detach(dev); if_free(ifp); igb_free_transmit_structures(adapter); igb_free_receive_structures(adapter); if (adapter->mta != NULL) free(adapter->mta, M_DEVBUF); IGB_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int igb_shutdown(device_t dev) { return igb_suspend(dev); } /* * Suspend/resume device methods. */ static int igb_suspend(device_t dev) { struct adapter *adapter = device_get_softc(dev); IGB_CORE_LOCK(adapter); igb_stop(adapter); igb_release_manageability(adapter); igb_release_hw_control(adapter); if (adapter->wol) { E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); E1000_WRITE_REG(&adapter->hw, E1000_WUFC, adapter->wol); igb_enable_wakeup(dev); } IGB_CORE_UNLOCK(adapter); return bus_generic_suspend(dev); } static int igb_resume(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct tx_ring *txr = adapter->tx_rings; struct ifnet *ifp = adapter->ifp; IGB_CORE_LOCK(adapter); igb_init_locked(adapter); igb_init_manageability(adapter); if ((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) { for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } } IGB_CORE_UNLOCK(adapter); return bus_generic_resume(dev); } #ifdef IGB_LEGACY_TX /********************************************************************* * Transmit entry point * * igb_start is called by the stack to initiate a transmit. * The driver will remain in this routine as long as there are * packets to transmit and transmit resources are available. * In case resources are not available stack is notified and * the packet is requeued. **********************************************************************/ static void igb_start_locked(struct tx_ring *txr, struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct mbuf *m_head; IGB_TX_LOCK_ASSERT(txr); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; if (!adapter->link_active) return; /* Call cleanup if number of TX descriptors low */ if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) igb_txeof(txr); while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if (txr->tx_avail <= IGB_MAX_SCATTER) { txr->queue_status |= IGB_QUEUE_DEPLETED; break; } IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. */ if (igb_xmit(txr, &m_head)) { if (m_head != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m_head); if (txr->tx_avail <= IGB_MAX_SCATTER) txr->queue_status |= IGB_QUEUE_DEPLETED; break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); /* Set watchdog on */ txr->watchdog_time = ticks; txr->queue_status |= IGB_QUEUE_WORKING; } } /* * Legacy TX driver routine, called from the * stack, always uses tx[0], and spins for it. * Should not be used with multiqueue tx */ static void igb_start(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IGB_TX_LOCK(txr); igb_start_locked(txr, ifp); IGB_TX_UNLOCK(txr); } return; } #else /* ~IGB_LEGACY_TX */ /* ** Multiqueue Transmit Entry: ** quick turnaround to the stack ** */ static int igb_mq_start(struct ifnet *ifp, struct mbuf *m) { struct adapter *adapter = ifp->if_softc; struct igb_queue *que; struct tx_ring *txr; int i, err = 0; #ifdef RSS uint32_t bucket_id; #endif /* Which queue to use */ /* * When doing RSS, map it to the same outbound queue * as the incoming flow would be mapped to. * * If everything is setup correctly, it should be the * same bucket that the current CPU we're on is. */ - if ((m->m_flags & M_FLOWID) != 0) { + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { #ifdef RSS if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), &bucket_id) == 0) { /* XXX TODO: spit out something if bucket_id > num_queues? */ i = bucket_id % adapter->num_queues; } else { #endif i = m->m_pkthdr.flowid % adapter->num_queues; #ifdef RSS } #endif } else { i = curcpu % adapter->num_queues; } txr = &adapter->tx_rings[i]; que = &adapter->queues[i]; err = drbr_enqueue(ifp, txr->br, m); if (err) return (err); if (IGB_TX_TRYLOCK(txr)) { igb_mq_start_locked(ifp, txr); IGB_TX_UNLOCK(txr); } else taskqueue_enqueue(que->tq, &txr->txq_task); return (0); } static int igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct mbuf *next; int err = 0, enq = 0; IGB_TX_LOCK_ASSERT(txr); if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) || adapter->link_active == 0) return (ENETDOWN); /* Process the queue */ while ((next = drbr_peek(ifp, txr->br)) != NULL) { if ((err = igb_xmit(txr, &next)) != 0) { if (next == NULL) { /* It was freed, move forward */ drbr_advance(ifp, txr->br); } else { /* * Still have one left, it may not be * the same since the transmit function * may have changed it. */ drbr_putback(ifp, txr->br, next); } break; } drbr_advance(ifp, txr->br); enq++; if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); if (next->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (enq > 0) { /* Set the watchdog */ txr->queue_status |= IGB_QUEUE_WORKING; txr->watchdog_time = ticks; } if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) igb_txeof(txr); if (txr->tx_avail <= IGB_MAX_SCATTER) txr->queue_status |= IGB_QUEUE_DEPLETED; return (err); } /* * Called from a taskqueue to drain queued transmit packets. */ static void igb_deferred_mq_start(void *arg, int pending) { struct tx_ring *txr = arg; struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; IGB_TX_LOCK(txr); if (!drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); IGB_TX_UNLOCK(txr); } /* ** Flush all ring buffers */ static void igb_qflush(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; struct mbuf *m; for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) m_freem(m); IGB_TX_UNLOCK(txr); } if_qflush(ifp); } #endif /* ~IGB_LEGACY_TX */ /********************************************************************* * Ioctl entry point * * igb_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif bool avoid_reset = FALSE; int error = 0; if (adapter->in_detach) return (error); switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) igb_init(adapter); #ifdef INET if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else error = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: { int max_frame_size; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFMTU (Set Interface MTU)"); IGB_CORE_LOCK(adapter); max_frame_size = 9234; if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN - ETHER_CRC_LEN) { IGB_CORE_UNLOCK(adapter); error = EINVAL; break; } ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); break; } case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl rcv'd:\ SIOCSIFFLAGS (Set Interface Flags)"); IGB_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ adapter->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { igb_disable_promisc(adapter); igb_set_promisc(adapter); } } else igb_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) igb_stop(adapter); adapter->if_flags = ifp->if_flags; IGB_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl rcv'd: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); igb_set_multi(adapter); #ifdef DEVICE_POLLING if (!(ifp->if_capenable & IFCAP_POLLING)) #endif igb_enable_intr(adapter); IGB_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: /* Check SOL/IDER usage */ IGB_CORE_LOCK(adapter); if (e1000_check_reset_block(&adapter->hw)) { IGB_CORE_UNLOCK(adapter); device_printf(adapter->dev, "Media change is" " blocked due to SOL/IDER session.\n"); break; } IGB_CORE_UNLOCK(adapter); case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl rcv'd: \ SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { int mask, reinit; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFCAP (Set Capabilities)"); reinit = 0; mask = ifr->ifr_reqcap ^ ifp->if_capenable; #ifdef DEVICE_POLLING if (mask & IFCAP_POLLING) { if (ifr->ifr_reqcap & IFCAP_POLLING) { error = ether_poll_register(igb_poll, ifp); if (error) return (error); IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); ifp->if_capenable |= IFCAP_POLLING; IGB_CORE_UNLOCK(adapter); } else { error = ether_poll_deregister(ifp); /* Enable interrupt even in error case */ IGB_CORE_LOCK(adapter); igb_enable_intr(adapter); ifp->if_capenable &= ~IFCAP_POLLING; IGB_CORE_UNLOCK(adapter); } } #endif if (mask & IFCAP_HWCSUM) { ifp->if_capenable ^= IFCAP_HWCSUM; reinit = 1; } if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; reinit = 1; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; reinit = 1; } if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; reinit = 1; } if (mask & IFCAP_VLAN_HWFILTER) { ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; reinit = 1; } if (mask & IFCAP_VLAN_HWTSO) { ifp->if_capenable ^= IFCAP_VLAN_HWTSO; reinit = 1; } if (mask & IFCAP_LRO) { ifp->if_capenable ^= IFCAP_LRO; reinit = 1; } if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) igb_init(adapter); VLAN_CAPABILITIES(ifp); break; } default: error = ether_ioctl(ifp, command, data); break; } return (error); } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ static void igb_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; INIT_DEBUGOUT("igb_init: begin"); IGB_CORE_LOCK_ASSERT(adapter); igb_disable_intr(adapter); callout_stop(&adapter->timer); /* Get the latest mac address, User can use a LAA */ bcopy(IF_LLADDR(adapter->ifp), adapter->hw.mac.addr, ETHER_ADDR_LEN); /* Put the address into the Receive Address Array */ e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0); igb_reset(adapter); igb_update_link_status(adapter); E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN); /* Set hardware offload abilities */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); #if __FreeBSD_version >= 800000 if (adapter->hw.mac.type == e1000_82576) ifp->if_hwassist |= CSUM_SCTP; #endif } if (ifp->if_capenable & IFCAP_TSO) ifp->if_hwassist |= CSUM_TSO; /* Configure for OS presence */ igb_init_manageability(adapter); /* Prepare transmit descriptors and buffers */ igb_setup_transmit_structures(adapter); igb_initialize_transmit_units(adapter); /* Setup Multicast table */ igb_set_multi(adapter); /* ** Figure out the desired mbuf pool ** for doing jumbo/packetsplit */ if (adapter->max_frame_size <= 2048) adapter->rx_mbuf_sz = MCLBYTES; else if (adapter->max_frame_size <= 4096) adapter->rx_mbuf_sz = MJUMPAGESIZE; else adapter->rx_mbuf_sz = MJUM9BYTES; /* Prepare receive descriptors and buffers */ if (igb_setup_receive_structures(adapter)) { device_printf(dev, "Could not setup receive structures\n"); return; } igb_initialize_receive_units(adapter); /* Enable VLAN support */ if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) igb_setup_vlan_hw_support(adapter); /* Don't lose promiscuous settings */ igb_set_promisc(adapter); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; callout_reset(&adapter->timer, hz, igb_local_timer, adapter); e1000_clear_hw_cntrs_base_generic(&adapter->hw); if (adapter->msix > 1) /* Set up queue routing */ igb_configure_queues(adapter); /* this clears any pending interrupts */ E1000_READ_REG(&adapter->hw, E1000_ICR); #ifdef DEVICE_POLLING /* * Only enable interrupts if we are not polling, make sure * they are off otherwise. */ if (ifp->if_capenable & IFCAP_POLLING) igb_disable_intr(adapter); else #endif /* DEVICE_POLLING */ { igb_enable_intr(adapter); E1000_WRITE_REG(&adapter->hw, E1000_ICS, E1000_ICS_LSC); } /* Set Energy Efficient Ethernet */ if (adapter->hw.phy.media_type == e1000_media_type_copper) { if (adapter->hw.mac.type == e1000_i354) e1000_set_eee_i354(&adapter->hw); else e1000_set_eee_i350(&adapter->hw); } } static void igb_init(void *arg) { struct adapter *adapter = arg; IGB_CORE_LOCK(adapter); igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_handle_que(void *context, int pending) { struct igb_queue *que = context; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { bool more; more = igb_rxeof(que, adapter->rx_process_limit, NULL); IGB_TX_LOCK(txr); igb_txeof(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); /* Do we need another? */ if (more) { taskqueue_enqueue(que->tq, &que->que_task); return; } } #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) return; #endif /* Reenable this interrupt */ if (que->eims) E1000_WRITE_REG(&adapter->hw, E1000_EIMS, que->eims); else igb_enable_intr(adapter); } /* Deal with link in a sleepable context */ static void igb_handle_link(void *context, int pending) { struct adapter *adapter = context; IGB_CORE_LOCK(adapter); igb_handle_link_locked(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_handle_link_locked(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct ifnet *ifp = adapter->ifp; IGB_CORE_LOCK_ASSERT(adapter); adapter->hw.mac.get_link_status = 1; igb_update_link_status(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) { for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } } } /********************************************************************* * * MSI/Legacy Deferred * Interrupt Service routine * *********************************************************************/ static int igb_irq_fast(void *arg) { struct adapter *adapter = arg; struct igb_queue *que = adapter->queues; u32 reg_icr; reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Hot eject? */ if (reg_icr == 0xffffffff) return FILTER_STRAY; /* Definitely not our interrupt. */ if (reg_icr == 0x0) return FILTER_STRAY; if ((reg_icr & E1000_ICR_INT_ASSERTED) == 0) return FILTER_STRAY; /* * Mask interrupts until the taskqueue is finished running. This is * cheap, just assume that it is needed. This also works around the * MSI message reordering errata on certain systems. */ igb_disable_intr(adapter); taskqueue_enqueue(que->tq, &que->que_task); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) taskqueue_enqueue(que->tq, &adapter->link_task); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; return FILTER_HANDLED; } #ifdef DEVICE_POLLING #if __FreeBSD_version >= 800000 #define POLL_RETURN_COUNT(a) (a) static int #else #define POLL_RETURN_COUNT(a) static void #endif igb_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct adapter *adapter = ifp->if_softc; struct igb_queue *que; struct tx_ring *txr; u32 reg_icr, rx_done = 0; u32 loop = IGB_MAX_LOOP; bool more; IGB_CORE_LOCK(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { IGB_CORE_UNLOCK(adapter); return POLL_RETURN_COUNT(rx_done); } if (cmd == POLL_AND_CHECK_STATUS) { reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) igb_handle_link_locked(adapter); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; } IGB_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; txr = que->txr; igb_rxeof(que, count, &rx_done); IGB_TX_LOCK(txr); do { more = igb_txeof(txr); } while (loop-- && more); #ifndef IGB_LEGACY_TX if (!drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } return POLL_RETURN_COUNT(rx_done); } #endif /* DEVICE_POLLING */ /********************************************************************* * * MSIX Que Interrupt Service routine * **********************************************************************/ static void igb_msix_que(void *arg) { struct igb_queue *que = arg; struct adapter *adapter = que->adapter; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; u32 newitr = 0; bool more_rx; /* Ignore spurious interrupts */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; E1000_WRITE_REG(&adapter->hw, E1000_EIMC, que->eims); ++que->irqs; IGB_TX_LOCK(txr); igb_txeof(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); more_rx = igb_rxeof(que, adapter->rx_process_limit, NULL); if (adapter->enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: ** - Write out last calculated setting ** - Calculate based on average size over ** the last interval. */ if (que->eitr_setting) E1000_WRITE_REG(&adapter->hw, E1000_EITR(que->msix), que->eitr_setting); que->eitr_setting = 0; /* Idle, do nothing */ if ((txr->bytes == 0) && (rxr->bytes == 0)) goto no_calc; /* Used half Default if sub-gig */ if (adapter->link_speed != 1000) newitr = IGB_DEFAULT_ITR / 2; else { if ((txr->bytes) && (txr->packets)) newitr = txr->bytes/txr->packets; if ((rxr->bytes) && (rxr->packets)) newitr = max(newitr, (rxr->bytes / rxr->packets)); newitr += 24; /* account for hardware frame, crc */ /* set an upper boundary */ newitr = min(newitr, 3000); /* Be nice to the mid range */ if ((newitr > 300) && (newitr < 1200)) newitr = (newitr / 3); else newitr = (newitr / 2); } newitr &= 0x7FFC; /* Mask invalid bits */ if (adapter->hw.mac.type == e1000_82575) newitr |= newitr << 16; else newitr |= E1000_EITR_CNT_IGNR; /* save for next interrupt */ que->eitr_setting = newitr; /* Reset state */ txr->bytes = 0; txr->packets = 0; rxr->bytes = 0; rxr->packets = 0; no_calc: /* Schedule a clean task if needed*/ if (more_rx) taskqueue_enqueue(que->tq, &que->que_task); else /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_EIMS, que->eims); return; } /********************************************************************* * * MSIX Link Interrupt Service routine * **********************************************************************/ static void igb_msix_link(void *arg) { struct adapter *adapter = arg; u32 icr; ++adapter->link_irq; icr = E1000_READ_REG(&adapter->hw, E1000_ICR); if (!(icr & E1000_ICR_LSC)) goto spurious; igb_handle_link(adapter, 0); spurious: /* Rearm */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_LSC); E1000_WRITE_REG(&adapter->hw, E1000_EIMS, adapter->link_mask); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. * **********************************************************************/ static void igb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct adapter *adapter = ifp->if_softc; INIT_DEBUGOUT("igb_media_status: begin"); IGB_CORE_LOCK(adapter); igb_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { IGB_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; switch (adapter->link_speed) { case 10: ifmr->ifm_active |= IFM_10_T; break; case 100: /* ** Support for 100Mb SFP - these are Fiber ** but the media type appears as serdes */ if (adapter->hw.phy.media_type == e1000_media_type_internal_serdes) ifmr->ifm_active |= IFM_100_FX; else ifmr->ifm_active |= IFM_100_TX; break; case 1000: ifmr->ifm_active |= IFM_1000_T; break; case 2500: ifmr->ifm_active |= IFM_2500_SX; break; } if (adapter->link_duplex == FULL_DUPLEX) ifmr->ifm_active |= IFM_FDX; else ifmr->ifm_active |= IFM_HDX; IGB_CORE_UNLOCK(adapter); } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. * **********************************************************************/ static int igb_media_change(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; INIT_DEBUGOUT("igb_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); IGB_CORE_LOCK(adapter); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT; break; case IFM_1000_LX: case IFM_1000_SX: case IFM_1000_T: adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL; break; case IFM_100_TX: adapter->hw.mac.autoneg = FALSE; adapter->hw.phy.autoneg_advertised = 0; if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_FULL; else adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_HALF; break; case IFM_10_T: adapter->hw.mac.autoneg = FALSE; adapter->hw.phy.autoneg_advertised = 0; if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_FULL; else adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_HALF; break; default: device_printf(adapter->dev, "Unsupported media type\n"); } igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); return (0); } /********************************************************************* * * This routine maps the mbufs to Advanced TX descriptors. * **********************************************************************/ static int igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) { struct adapter *adapter = txr->adapter; u32 olinfo_status = 0, cmd_type_len; int i, j, error, nsegs; int first; bool remap = TRUE; struct mbuf *m_head; bus_dma_segment_t segs[IGB_MAX_SCATTER]; bus_dmamap_t map; struct igb_tx_buf *txbuf; union e1000_adv_tx_desc *txd = NULL; m_head = *m_headp; /* Basic descriptor defines */ cmd_type_len = (E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT); if (m_head->m_flags & M_VLANTAG) cmd_type_len |= E1000_ADVTXD_DCMD_VLE; /* * Important to capture the first descriptor * used because it will contain the index of * the one we tell the hardware to report back */ first = txr->next_avail_desc; txbuf = &txr->tx_buffers[first]; map = txbuf->map; /* * Map the packet for DMA. */ retry: error = bus_dmamap_load_mbuf_sg(txr->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (__predict_false(error)) { struct mbuf *m; switch (error) { case EFBIG: /* Try it again? - one try */ if (remap == TRUE) { remap = FALSE; m = m_defrag(*m_headp, M_NOWAIT); if (m == NULL) { adapter->mbuf_defrag_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; goto retry; } else return (error); case ENOMEM: txr->no_tx_dma_setup++; return (error); default: txr->no_tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } } /* Make certain there are enough descriptors */ if (nsegs > txr->tx_avail - 2) { txr->no_desc_avail++; bus_dmamap_unload(txr->txtag, map); return (ENOBUFS); } m_head = *m_headp; /* ** Set up the appropriate offload context ** this will consume the first descriptor */ error = igb_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status); if (__predict_false(error)) { m_freem(*m_headp); *m_headp = NULL; return (error); } /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) olinfo_status |= txr->me << 4; i = txr->next_avail_desc; for (j = 0; j < nsegs; j++) { bus_size_t seglen; bus_addr_t segaddr; txbuf = &txr->tx_buffers[i]; txd = &txr->tx_base[i]; seglen = segs[j].ds_len; segaddr = htole64(segs[j].ds_addr); txd->read.buffer_addr = segaddr; txd->read.cmd_type_len = htole32(E1000_TXD_CMD_IFCS | cmd_type_len | seglen); txd->read.olinfo_status = htole32(olinfo_status); if (++i == txr->num_desc) i = 0; } txd->read.cmd_type_len |= htole32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS); txr->tx_avail -= nsegs; txr->next_avail_desc = i; txbuf->m_head = m_head; /* ** Here we swap the map so the last descriptor, ** which gets the completion interrupt has the ** real map, and the first descriptor gets the ** unused map from this descriptor. */ txr->tx_buffers[first].map = txbuf->map; txbuf->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); /* Set the EOP descriptor that will be marked done */ txbuf = &txr->tx_buffers[first]; txbuf->eop = txd; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * Advance the Transmit Descriptor Tail (Tdt), this tells the * hardware that this frame is available to transmit. */ ++txr->total_packets; E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), i); return (0); } static void igb_set_promisc(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; u32 reg; if (adapter->vf_ifp) { e1000_promisc_set_vf(hw, e1000_promisc_enabled); return; } reg = E1000_READ_REG(hw, E1000_RCTL); if (ifp->if_flags & IFF_PROMISC) { reg |= (E1000_RCTL_UPE | E1000_RCTL_MPE); E1000_WRITE_REG(hw, E1000_RCTL, reg); } else if (ifp->if_flags & IFF_ALLMULTI) { reg |= E1000_RCTL_MPE; reg &= ~E1000_RCTL_UPE; E1000_WRITE_REG(hw, E1000_RCTL, reg); } } static void igb_disable_promisc(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 reg; int mcnt = 0; if (adapter->vf_ifp) { e1000_promisc_set_vf(hw, e1000_promisc_disabled); return; } reg = E1000_READ_REG(hw, E1000_RCTL); reg &= (~E1000_RCTL_UPE); if (ifp->if_flags & IFF_ALLMULTI) mcnt = MAX_NUM_MULTICAST_ADDRESSES; else { struct ifmultiaddr *ifma; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif } /* Don't disable if in MAX groups */ if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) reg &= (~E1000_RCTL_MPE); E1000_WRITE_REG(hw, E1000_RCTL, reg); } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ static void igb_set_multi(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ifmultiaddr *ifma; u32 reg_rctl = 0; u8 *mta; int mcnt = 0; IOCTL_DEBUGOUT("igb_set_multi: begin"); mta = adapter->mta; bzero(mta, sizeof(uint8_t) * ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES); #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &mta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl |= E1000_RCTL_MPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } else e1000_update_mc_addr_list(&adapter->hw, mta, mcnt); } /********************************************************************* * Timer routine: * This routine checks for link status, * updates statistics, and does the watchdog. * **********************************************************************/ static void igb_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; struct igb_queue *que = adapter->queues; int hung = 0, busy = 0; IGB_CORE_LOCK_ASSERT(adapter); igb_update_link_status(adapter); igb_update_stats_counters(adapter); /* ** Check the TX queues status ** - central locked handling of OACTIVE ** - watchdog only if all queues show hung */ for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { if ((txr->queue_status & IGB_QUEUE_HUNG) && (adapter->pause_frames == 0)) ++hung; if (txr->queue_status & IGB_QUEUE_DEPLETED) ++busy; if ((txr->queue_status & IGB_QUEUE_IDLE) == 0) taskqueue_enqueue(que->tq, &que->que_task); } if (hung == adapter->num_queues) goto timeout; if (busy == adapter->num_queues) ifp->if_drv_flags |= IFF_DRV_OACTIVE; else if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) && (busy < adapter->num_queues)) ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; adapter->pause_frames = 0; callout_reset(&adapter->timer, hz, igb_local_timer, adapter); #ifndef DEVICE_POLLING /* Schedule all queue interrupts - deadlock protection */ E1000_WRITE_REG(&adapter->hw, E1000_EICS, adapter->que_mask); #endif return; timeout: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); device_printf(dev,"Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, E1000_READ_REG(&adapter->hw, E1000_TDH(txr->me)), E1000_READ_REG(&adapter->hw, E1000_TDT(txr->me))); device_printf(dev,"TX(%d) desc avail = %d," "Next TX to Clean = %d\n", txr->me, txr->tx_avail, txr->next_to_clean); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; igb_init_locked(adapter); } static void igb_update_link_status(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_fc_info *fc = &hw->fc; struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; u32 link_check, thstat, ctrl; char *flowctl = NULL; link_check = thstat = ctrl = 0; /* Get the cached link value or read for real */ switch (hw->phy.media_type) { case e1000_media_type_copper: if (hw->mac.get_link_status) { /* Do the work to read phy */ e1000_check_for_link(hw); link_check = !hw->mac.get_link_status; } else link_check = TRUE; break; case e1000_media_type_fiber: e1000_check_for_link(hw); link_check = (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU); break; case e1000_media_type_internal_serdes: e1000_check_for_link(hw); link_check = adapter->hw.mac.serdes_has_link; break; /* VF device is type_unknown */ case e1000_media_type_unknown: e1000_check_for_link(hw); link_check = !hw->mac.get_link_status; /* Fall thru */ default: break; } /* Check for thermal downshift or shutdown */ if (hw->mac.type == e1000_i350) { thstat = E1000_READ_REG(hw, E1000_THSTAT); ctrl = E1000_READ_REG(hw, E1000_CTRL_EXT); } /* Get the flow control for display */ switch (fc->current_mode) { case e1000_fc_rx_pause: flowctl = "RX"; break; case e1000_fc_tx_pause: flowctl = "TX"; break; case e1000_fc_full: flowctl = "Full"; break; case e1000_fc_none: default: flowctl = "None"; break; } /* Now we check if a transition has happened */ if (link_check && (adapter->link_active == 0)) { e1000_get_speed_and_duplex(&adapter->hw, &adapter->link_speed, &adapter->link_duplex); if (bootverbose) device_printf(dev, "Link is up %d Mbps %s," " Flow Control: %s\n", adapter->link_speed, ((adapter->link_duplex == FULL_DUPLEX) ? "Full Duplex" : "Half Duplex"), flowctl); adapter->link_active = 1; ifp->if_baudrate = adapter->link_speed * 1000000; if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) && (thstat & E1000_THSTAT_LINK_THROTTLE)) device_printf(dev, "Link: thermal downshift\n"); /* Delay Link Up for Phy update */ if (((hw->mac.type == e1000_i210) || (hw->mac.type == e1000_i211)) && (hw->phy.id == I210_I_PHY_ID)) msec_delay(I210_LINK_DELAY); /* Reset if the media type changed. */ if (hw->dev_spec._82575.media_changed) { hw->dev_spec._82575.media_changed = false; adapter->flags |= IGB_MEDIA_RESET; igb_reset(adapter); } /* This can sleep */ if_link_state_change(ifp, LINK_STATE_UP); } else if (!link_check && (adapter->link_active == 1)) { ifp->if_baudrate = adapter->link_speed = 0; adapter->link_duplex = 0; if (bootverbose) device_printf(dev, "Link is Down\n"); if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) && (thstat & E1000_THSTAT_PWR_DOWN)) device_printf(dev, "Link: thermal shutdown\n"); adapter->link_active = 0; /* This can sleep */ if_link_state_change(ifp, LINK_STATE_DOWN); /* Reset queue state */ for (int i = 0; i < adapter->num_queues; i++, txr++) txr->queue_status = IGB_QUEUE_IDLE; } } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * **********************************************************************/ static void igb_stop(void *arg) { struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; IGB_CORE_LOCK_ASSERT(adapter); INIT_DEBUGOUT("igb_stop: begin"); igb_disable_intr(adapter); callout_stop(&adapter->timer); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ifp->if_drv_flags |= IFF_DRV_OACTIVE; /* Disarm watchdog timer. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); txr->queue_status = IGB_QUEUE_IDLE; IGB_TX_UNLOCK(txr); } e1000_reset_hw(&adapter->hw); E1000_WRITE_REG(&adapter->hw, E1000_WUC, 0); e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void igb_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; /* Make sure our PCI config space has the necessary stuff set */ pci_enable_busmaster(dev); adapter->hw.bus.pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2); /* Save off the information about this board */ adapter->hw.vendor_id = pci_get_vendor(dev); adapter->hw.device_id = pci_get_device(dev); adapter->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1); adapter->hw.subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); adapter->hw.subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); /* Set MAC type early for PCI setup */ e1000_set_mac_type(&adapter->hw); /* Are we a VF device? */ if ((adapter->hw.mac.type == e1000_vfadapt) || (adapter->hw.mac.type == e1000_vfadapt_i350)) adapter->vf_ifp = 1; else adapter->vf_ifp = 0; } static int igb_allocate_pci_resources(struct adapter *adapter) { device_t dev = adapter->dev; int rid; rid = PCIR_BAR(0); adapter->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->pci_mem == NULL) { device_printf(dev, "Unable to allocate bus resource: memory\n"); return (ENXIO); } adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->pci_mem); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->pci_mem); adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle; adapter->num_queues = 1; /* Defaults for Legacy or MSI */ /* This will setup either MSI/X or MSI */ adapter->msix = igb_setup_msix(adapter); adapter->hw.back = &adapter->osdep; return (0); } /********************************************************************* * * Setup the Legacy or MSI Interrupt handler * **********************************************************************/ static int igb_allocate_legacy(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = adapter->queues; #ifndef IGB_LEGACY_TX struct tx_ring *txr = adapter->tx_rings; #endif int error, rid = 0; /* Turn off all interrupts */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); /* MSI RID is 1 */ if (adapter->msix == 1) rid = 1; /* We allocate a single interrupt resource */ adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "interrupt\n"); return (ENXIO); } #ifndef IGB_LEGACY_TX TASK_INIT(&txr->txq_task, 0, igb_deferred_mq_start, txr); #endif /* * Try allocating a fast interrupt and the associated deferred * processing contexts. */ TASK_INIT(&que->que_task, 0, igb_handle_que, que); /* Make tasklet for deferred link handling */ TASK_INIT(&adapter->link_task, 0, igb_handle_link, adapter); que->tq = taskqueue_create_fast("igb_taskq", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s taskq", device_get_nameunit(adapter->dev)); if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, igb_irq_fast, NULL, adapter, &adapter->tag)) != 0) { device_printf(dev, "Failed to register fast interrupt " "handler: %d\n", error); taskqueue_free(que->tq); que->tq = NULL; return (error); } return (0); } /********************************************************************* * * Setup the MSIX Queue Interrupt handlers: * **********************************************************************/ static int igb_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = adapter->queues; int error, rid, vector = 0; int cpu_id = 0; /* Be sure to start with all interrupts disabled */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, ~0); E1000_WRITE_FLUSH(&adapter->hw); #ifdef RSS /* * If we're doing RSS, the number of queues needs to * match the number of RSS buckets that are configured. * * + If there's more queues than RSS buckets, we'll end * up with queues that get no traffic. * * + If there's more RSS buckets than queues, we'll end * up having multiple RSS buckets map to the same queue, * so there'll be some contention. */ if (adapter->num_queues != rss_getnumbuckets()) { device_printf(dev, "%s: number of queues (%d) != number of RSS buckets (%d)" "; performance will be impacted.\n", __func__, adapter->num_queues, rss_getnumbuckets()); } #endif for (int i = 0; i < adapter->num_queues; i++, vector++, que++) { rid = vector +1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (que->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "MSIX Queue Interrupt\n"); return (ENXIO); } error = bus_setup_intr(dev, que->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, igb_msix_que, que, &que->tag); if (error) { que->res = NULL; device_printf(dev, "Failed to register Queue handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, que->res, que->tag, "que %d", i); #endif que->msix = vector; if (adapter->hw.mac.type == e1000_82575) que->eims = E1000_EICR_TX_QUEUE0 << i; else que->eims = 1 << vector; #ifdef RSS /* * The queue ID is used as the RSS layer bucket ID. * We look up the queue ID -> RSS CPU ID and select * that. */ cpu_id = rss_getcpu(i % rss_getnumbuckets()); #else /* * Bind the msix vector, and thus the * rings to the corresponding cpu. * * This just happens to match the default RSS round-robin * bucket -> queue -> CPU allocation. */ if (adapter->num_queues > 1) { if (igb_last_bind_cpu < 0) igb_last_bind_cpu = CPU_FIRST(); cpu_id = igb_last_bind_cpu; } #endif if (adapter->num_queues > 1) { bus_bind_intr(dev, que->res, cpu_id); #ifdef RSS device_printf(dev, "Bound queue %d to RSS bucket %d\n", i, cpu_id); #else device_printf(dev, "Bound queue %d to cpu %d\n", i, cpu_id); #endif } #ifndef IGB_LEGACY_TX TASK_INIT(&que->txr->txq_task, 0, igb_deferred_mq_start, que->txr); #endif /* Make tasklet for deferred handling */ TASK_INIT(&que->que_task, 0, igb_handle_que, que); que->tq = taskqueue_create("igb_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); if (adapter->num_queues > 1) { /* * Only pin the taskqueue thread to a CPU if * RSS is in use. * * This again just happens to match the default RSS * round-robin bucket -> queue -> CPU allocation. */ #ifdef RSS taskqueue_start_threads_pinned(&que->tq, 1, PI_NET, cpu_id, "%s que (bucket %d)", device_get_nameunit(adapter->dev), cpu_id); #else taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que (qid %d)", device_get_nameunit(adapter->dev), cpu_id); #endif } else { taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); } /* Finally update the last bound CPU id */ if (adapter->num_queues > 1) igb_last_bind_cpu = CPU_NEXT(igb_last_bind_cpu); } /* And Link */ rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "MSIX Link Interrupt\n"); return (ENXIO); } if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, igb_msix_link, adapter, &adapter->tag)) != 0) { device_printf(dev, "Failed to register Link handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->linkvec = vector; return (0); } static void igb_configure_queues(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct igb_queue *que; u32 tmp, ivar = 0, newitr = 0; /* First turn on RSS capability */ if (adapter->hw.mac.type != e1000_82575) E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE | E1000_GPIE_EIAME | E1000_GPIE_PBA | E1000_GPIE_NSICR); /* Turn on MSIX */ switch (adapter->hw.mac.type) { case e1000_82580: case e1000_i350: case e1000_i354: case e1000_i210: case e1000_i211: case e1000_vfadapt: case e1000_vfadapt_i350: /* RX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i >> 1; ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i & 1) { ivar &= 0xFF00FFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 16; } else { ivar &= 0xFFFFFF00; ivar |= que->msix | E1000_IVAR_VALID; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); } /* TX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i >> 1; ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i & 1) { ivar &= 0x00FFFFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 24; } else { ivar &= 0xFFFF00FF; ivar |= (que->msix | E1000_IVAR_VALID) << 8; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* And for the link interrupt */ ivar = (adapter->linkvec | E1000_IVAR_VALID) << 8; adapter->link_mask = 1 << adapter->linkvec; E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar); break; case e1000_82576: /* RX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i & 0x7; /* Each IVAR has two entries */ ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i < 8) { ivar &= 0xFFFFFF00; ivar |= que->msix | E1000_IVAR_VALID; } else { ivar &= 0xFF00FFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 16; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* TX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i & 0x7; /* Each IVAR has two entries */ ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i < 8) { ivar &= 0xFFFF00FF; ivar |= (que->msix | E1000_IVAR_VALID) << 8; } else { ivar &= 0x00FFFFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 24; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* And for the link interrupt */ ivar = (adapter->linkvec | E1000_IVAR_VALID) << 8; adapter->link_mask = 1 << adapter->linkvec; E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar); break; case e1000_82575: /* enable MSI-X support*/ tmp = E1000_READ_REG(hw, E1000_CTRL_EXT); tmp |= E1000_CTRL_EXT_PBA_CLR; /* Auto-Mask interrupts upon ICR read. */ tmp |= E1000_CTRL_EXT_EIAME; tmp |= E1000_CTRL_EXT_IRCA; E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmp); /* Queues */ for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; tmp = E1000_EICR_RX_QUEUE0 << i; tmp |= E1000_EICR_TX_QUEUE0 << i; que->eims = tmp; E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), i, que->eims); adapter->que_mask |= que->eims; } /* Link */ E1000_WRITE_REG(hw, E1000_MSIXBM(adapter->linkvec), E1000_EIMS_OTHER); adapter->link_mask |= E1000_EIMS_OTHER; default: break; } /* Set the starting interrupt rate */ if (igb_max_interrupt_rate > 0) newitr = (4000000 / igb_max_interrupt_rate) & 0x7FFC; if (hw->mac.type == e1000_82575) newitr |= newitr << 16; else newitr |= E1000_EITR_CNT_IGNR; for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; E1000_WRITE_REG(hw, E1000_EITR(que->msix), newitr); } return; } static void igb_free_pci_resources(struct adapter *adapter) { struct igb_queue *que = adapter->queues; device_t dev = adapter->dev; int rid; /* ** There is a slight possibility of a failure mode ** in attach that will result in entering this function ** before interrupt resources have been initialized, and ** in that case we do not want to execute the loops below ** We can detect this reliably by the state of the adapter ** res pointer. */ if (adapter->res == NULL) goto mem; /* * First release all the interrupt resources: */ for (int i = 0; i < adapter->num_queues; i++, que++) { rid = que->msix + 1; if (que->tag != NULL) { bus_teardown_intr(dev, que->res, que->tag); que->tag = NULL; } if (que->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, que->res); } /* Clean the Legacy or Link interrupt last */ if (adapter->linkvec) /* we are doing MSIX */ rid = adapter->linkvec + 1; else (adapter->msix != 0) ? (rid = 1):(rid = 0); que = adapter->queues; if (adapter->tag != NULL) { taskqueue_drain(que->tq, &adapter->link_task); bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); for (int i = 0; i < adapter->num_queues; i++, que++) { if (que->tq != NULL) { #ifndef IGB_LEGACY_TX taskqueue_drain(que->tq, &que->txr->txq_task); #endif taskqueue_drain(que->tq, &que->que_task); taskqueue_free(que->tq); } } mem: if (adapter->msix) pci_release_msi(dev); if (adapter->msix_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, adapter->memrid, adapter->msix_mem); if (adapter->pci_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), adapter->pci_mem); } /* * Setup Either MSI/X or MSI */ static int igb_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; int bar, want, queues, msgs, maxqueues; int n_queues; /* tuneable override */ if (igb_enable_msix == 0) goto msi; /* First try MSI/X */ msgs = pci_msix_count(dev); if (msgs == 0) goto msi; /* ** Some new devices, as with ixgbe, now may ** use a different BAR, so we need to keep ** track of which is used. */ adapter->memrid = PCIR_BAR(IGB_MSIX_BAR); bar = pci_read_config(dev, adapter->memrid, 4); if (bar == 0) /* use next bar */ adapter->memrid += 4; adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &adapter->memrid, RF_ACTIVE); if (adapter->msix_mem == NULL) { /* May not be enabled */ device_printf(adapter->dev, "Unable to map MSIX table \n"); goto msi; } n_queues = 0; /* try more specific tunable, then global, then finally default to boot time tunable if set. */ if (device_getenv_int(dev, "num_queues", &n_queues) != 0) { device_printf(dev, "using specific tunable num_queues=%d", n_queues); } else if (TUNABLE_INT_FETCH("hw.igb.num_queues", &n_queues) != 0) { if (igb_num_queues != n_queues) { device_printf(dev, "using global tunable hw.igb.num_queues=%d", n_queues); igb_num_queues = n_queues; } } else { n_queues = igb_num_queues; } #ifdef RSS /* If we're doing RSS, clamp at the number of RSS buckets */ if (queues > rss_getnumbuckets()) queues = rss_getnumbuckets(); #endif if (n_queues != 0) { queues = n_queues; } else { /* Figure out a reasonable auto config value */ queues = (mp_ncpus > (msgs-1)) ? (msgs-1) : mp_ncpus; } /* Sanity check based on HW */ switch (adapter->hw.mac.type) { case e1000_82575: maxqueues = 4; break; case e1000_82576: case e1000_82580: case e1000_i350: case e1000_i354: maxqueues = 8; break; case e1000_i210: maxqueues = 4; break; case e1000_i211: maxqueues = 2; break; default: /* VF interfaces */ maxqueues = 1; break; } if (queues > maxqueues) { device_printf(adapter->dev, "requested %d queues, but max for this adapter is %d\n", queues, maxqueues); queues = maxqueues; } else if (queues == 0) { queues = 1; } else if (queues < 0) { device_printf(adapter->dev, "requested %d queues, but min for this adapter is %d\n", queues, 1); queues = 1; } /* ** One vector (RX/TX pair) per queue ** plus an additional for Link interrupt */ want = queues + 1; if (msgs >= want) msgs = want; else { device_printf(adapter->dev, "MSIX Configuration Problem, " "%d vectors configured, but %d queues wanted!\n", msgs, want); goto msi; } if ((pci_alloc_msix(dev, &msgs) == 0) && (msgs == want)) { device_printf(adapter->dev, "Using MSIX interrupts with %d vectors\n", msgs); adapter->num_queues = queues; return (msgs); } /* ** If MSIX alloc failed or provided us with ** less than needed, free and fall through to MSI */ pci_release_msi(dev); msi: if (adapter->msix_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(IGB_MSIX_BAR), adapter->msix_mem); adapter->msix_mem = NULL; } msgs = 1; if (pci_alloc_msi(dev, &msgs) == 0) { device_printf(adapter->dev," Using an MSI interrupt\n"); return (msgs); } device_printf(adapter->dev," Using a Legacy interrupt\n"); return (0); } /********************************************************************* * * Initialize the DMA Coalescing feature * **********************************************************************/ static void igb_init_dmac(struct adapter *adapter, u32 pba) { device_t dev = adapter->dev; struct e1000_hw *hw = &adapter->hw; u32 dmac, reg = ~E1000_DMACR_DMAC_EN; u16 hwm; if (hw->mac.type == e1000_i211) return; if (hw->mac.type > e1000_82580) { if (adapter->dmac == 0) { /* Disabling it */ E1000_WRITE_REG(hw, E1000_DMACR, reg); return; } else device_printf(dev, "DMA Coalescing enabled\n"); /* Set starting threshold */ E1000_WRITE_REG(hw, E1000_DMCTXTH, 0); hwm = 64 * pba - adapter->max_frame_size / 16; if (hwm < 64 * (pba - 6)) hwm = 64 * (pba - 6); reg = E1000_READ_REG(hw, E1000_FCRTC); reg &= ~E1000_FCRTC_RTH_COAL_MASK; reg |= ((hwm << E1000_FCRTC_RTH_COAL_SHIFT) & E1000_FCRTC_RTH_COAL_MASK); E1000_WRITE_REG(hw, E1000_FCRTC, reg); dmac = pba - adapter->max_frame_size / 512; if (dmac < pba - 10) dmac = pba - 10; reg = E1000_READ_REG(hw, E1000_DMACR); reg &= ~E1000_DMACR_DMACTHR_MASK; reg = ((dmac << E1000_DMACR_DMACTHR_SHIFT) & E1000_DMACR_DMACTHR_MASK); /* transition to L0x or L1 if available..*/ reg |= (E1000_DMACR_DMAC_EN | E1000_DMACR_DMAC_LX_MASK); /* Check if status is 2.5Gb backplane connection * before configuration of watchdog timer, which is * in msec values in 12.8usec intervals * watchdog timer= msec values in 32usec intervals * for non 2.5Gb connection */ if (hw->mac.type == e1000_i354) { int status = E1000_READ_REG(hw, E1000_STATUS); if ((status & E1000_STATUS_2P5_SKU) && (!(status & E1000_STATUS_2P5_SKU_OVER))) reg |= ((adapter->dmac * 5) >> 6); else reg |= (adapter->dmac >> 5); } else { reg |= (adapter->dmac >> 5); } E1000_WRITE_REG(hw, E1000_DMACR, reg); #ifdef I210_OBFF_SUPPORT /* * Set the OBFF Rx threshold to DMA Coalescing Rx * threshold - 2KB and enable the feature in the * hardware for I210. */ if (hw->mac.type == e1000_i210) { int obff = dmac - 2; reg = E1000_READ_REG(hw, E1000_DOBFFCTL); reg &= ~E1000_DOBFFCTL_OBFFTHR_MASK; reg |= (obff & E1000_DOBFFCTL_OBFFTHR_MASK) | E1000_DOBFFCTL_EXIT_ACT_MASK; E1000_WRITE_REG(hw, E1000_DOBFFCTL, reg); } #endif E1000_WRITE_REG(hw, E1000_DMCRTRH, 0); /* Set the interval before transition */ reg = E1000_READ_REG(hw, E1000_DMCTLX); if (hw->mac.type == e1000_i350) reg |= IGB_DMCTLX_DCFLUSH_DIS; /* ** in 2.5Gb connection, TTLX unit is 0.4 usec ** which is 0x4*2 = 0xA. But delay is still 4 usec */ if (hw->mac.type == e1000_i354) { int status = E1000_READ_REG(hw, E1000_STATUS); if ((status & E1000_STATUS_2P5_SKU) && (!(status & E1000_STATUS_2P5_SKU_OVER))) reg |= 0xA; else reg |= 0x4; } else { reg |= 0x4; } E1000_WRITE_REG(hw, E1000_DMCTLX, reg); /* free space in tx packet buffer to wake from DMA coal */ E1000_WRITE_REG(hw, E1000_DMCTXTH, (IGB_TXPBSIZE - (2 * adapter->max_frame_size)) >> 6); /* make low power state decision controlled by DMA coal */ reg = E1000_READ_REG(hw, E1000_PCIEMISC); reg &= ~E1000_PCIEMISC_LX_DECISION; E1000_WRITE_REG(hw, E1000_PCIEMISC, reg); } else if (hw->mac.type == e1000_82580) { u32 reg = E1000_READ_REG(hw, E1000_PCIEMISC); E1000_WRITE_REG(hw, E1000_PCIEMISC, reg & ~E1000_PCIEMISC_LX_DECISION); E1000_WRITE_REG(hw, E1000_DMACR, 0); } } /********************************************************************* * * Set up an fresh starting state * **********************************************************************/ static void igb_reset(struct adapter *adapter) { device_t dev = adapter->dev; struct e1000_hw *hw = &adapter->hw; struct e1000_fc_info *fc = &hw->fc; struct ifnet *ifp = adapter->ifp; u32 pba = 0; u16 hwm; INIT_DEBUGOUT("igb_reset: begin"); /* Let the firmware know the OS is in control */ igb_get_hw_control(adapter); /* * Packet Buffer Allocation (PBA) * Writing PBA sets the receive portion of the buffer * the remainder is used for the transmit buffer. */ switch (hw->mac.type) { case e1000_82575: pba = E1000_PBA_32K; break; case e1000_82576: case e1000_vfadapt: pba = E1000_READ_REG(hw, E1000_RXPBS); pba &= E1000_RXPBS_SIZE_MASK_82576; break; case e1000_82580: case e1000_i350: case e1000_i354: case e1000_vfadapt_i350: pba = E1000_READ_REG(hw, E1000_RXPBS); pba = e1000_rxpbs_adjust_82580(pba); break; case e1000_i210: case e1000_i211: pba = E1000_PBA_34K; default: break; } /* Special needs in case of Jumbo frames */ if ((hw->mac.type == e1000_82575) && (ifp->if_mtu > ETHERMTU)) { u32 tx_space, min_tx, min_rx; pba = E1000_READ_REG(hw, E1000_PBA); tx_space = pba >> 16; pba &= 0xffff; min_tx = (adapter->max_frame_size + sizeof(struct e1000_tx_desc) - ETHERNET_FCS_SIZE) * 2; min_tx = roundup2(min_tx, 1024); min_tx >>= 10; min_rx = adapter->max_frame_size; min_rx = roundup2(min_rx, 1024); min_rx >>= 10; if (tx_space < min_tx && ((min_tx - tx_space) < pba)) { pba = pba - (min_tx - tx_space); /* * if short on rx space, rx wins * and must trump tx adjustment */ if (pba < min_rx) pba = min_rx; } E1000_WRITE_REG(hw, E1000_PBA, pba); } INIT_DEBUGOUT1("igb_init: pba=%dK",pba); /* * These parameters control the automatic generation (Tx) and * response (Rx) to Ethernet PAUSE frames. * - High water mark should allow for at least two frames to be * received after sending an XOFF. * - Low water mark works best when it is very near the high water mark. * This allows the receiver to restart by sending XON when it has * drained a bit. */ hwm = min(((pba << 10) * 9 / 10), ((pba << 10) - 2 * adapter->max_frame_size)); if (hw->mac.type < e1000_82576) { fc->high_water = hwm & 0xFFF8; /* 8-byte granularity */ fc->low_water = fc->high_water - 8; } else { fc->high_water = hwm & 0xFFF0; /* 16-byte granularity */ fc->low_water = fc->high_water - 16; } fc->pause_time = IGB_FC_PAUSE_TIME; fc->send_xon = TRUE; if (adapter->fc) fc->requested_mode = adapter->fc; else fc->requested_mode = e1000_fc_default; /* Issue a global reset */ e1000_reset_hw(hw); E1000_WRITE_REG(hw, E1000_WUC, 0); /* Reset for AutoMediaDetect */ if (adapter->flags & IGB_MEDIA_RESET) { e1000_setup_init_funcs(hw, TRUE); e1000_get_bus_info(hw); adapter->flags &= ~IGB_MEDIA_RESET; } if (e1000_init_hw(hw) < 0) device_printf(dev, "Hardware Initialization Failed\n"); /* Setup DMA Coalescing */ igb_init_dmac(adapter, pba); E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN); e1000_get_phy_info(hw); e1000_check_for_link(hw); return; } /********************************************************************* * * Setup networking device structure and register an interface. * **********************************************************************/ static int igb_setup_interface(device_t dev, struct adapter *adapter) { struct ifnet *ifp; INIT_DEBUGOUT("igb_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (-1); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_init = igb_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = igb_ioctl; ifp->if_get_counter = igb_get_counter; #ifndef IGB_LEGACY_TX ifp->if_transmit = igb_mq_start; ifp->if_qflush = igb_qflush; #else ifp->if_start = igb_start; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 1); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 1; IFQ_SET_READY(&ifp->if_snd); #endif ether_ifattach(ifp, adapter->hw.mac.addr); ifp->if_capabilities = ifp->if_capenable = 0; ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_TSO; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capenable = ifp->if_capabilities; /* Don't enable LRO by default */ ifp->if_capabilities |= IFCAP_LRO; #ifdef DEVICE_POLLING ifp->if_capabilities |= IFCAP_POLLING; #endif /* * Tell the upper layer(s) we * support full VLAN capability. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; /* ** Don't turn this on by default, if vlans are ** created on another pseudo device (eg. lagg) ** then vlan events are not passed thru, breaking ** operation, but with HW FILTER off it works. If ** using vlans directly on the igb driver you can ** enable this and get full hardware tag filtering. */ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, igb_media_change, igb_media_status); if ((adapter->hw.phy.media_type == e1000_media_type_fiber) || (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX, 0, NULL); } else { ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX | IFM_FDX, 0, NULL); if (adapter->hw.phy.type != e1000_phy_ife) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T, 0, NULL); } } ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return (0); } /* * Manage DMA'able memory. */ static void igb_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { if (error) return; *(bus_addr_t *) arg = segs[0].ds_addr; } static int igb_dma_malloc(struct adapter *adapter, bus_size_t size, struct igb_dma_alloc *dma, int mapflags) { int error; error = bus_dma_tag_create(bus_get_dma_tag(adapter->dev), /* parent */ IGB_DBA_ALIGN, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->dma_tag); if (error) { device_printf(adapter->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, error); goto fail_0; } error = bus_dmamem_alloc(dma->dma_tag, (void**) &dma->dma_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT, &dma->dma_map); if (error) { device_printf(adapter->dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, error); goto fail_2; } dma->dma_paddr = 0; error = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, igb_dmamap_cb, &dma->dma_paddr, mapflags | BUS_DMA_NOWAIT); if (error || dma->dma_paddr == 0) { device_printf(adapter->dev, "%s: bus_dmamap_load failed: %d\n", __func__, error); goto fail_3; } return (0); fail_3: bus_dmamap_unload(dma->dma_tag, dma->dma_map); fail_2: bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); bus_dma_tag_destroy(dma->dma_tag); fail_0: dma->dma_tag = NULL; return (error); } static void igb_dma_free(struct adapter *adapter, struct igb_dma_alloc *dma) { if (dma->dma_tag == NULL) return; if (dma->dma_paddr != 0) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); dma->dma_paddr = 0; } if (dma->dma_vaddr != NULL) { bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); dma->dma_vaddr = NULL; } bus_dma_tag_destroy(dma->dma_tag); dma->dma_tag = NULL; } /********************************************************************* * * Allocate memory for the transmit and receive rings, and then * the descriptors associated with each, called only once at attach. * **********************************************************************/ static int igb_allocate_queues(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = NULL; struct tx_ring *txr = NULL; struct rx_ring *rxr = NULL; int rsize, tsize, error = E1000_SUCCESS; int txconf = 0, rxconf = 0; /* First allocate the top level queue structs */ if (!(adapter->queues = (struct igb_queue *) malloc(sizeof(struct igb_queue) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate queue memory\n"); error = ENOMEM; goto fail; } /* Next allocate the TX ring struct memory */ if (!(adapter->tx_rings = (struct tx_ring *) malloc(sizeof(struct tx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); error = ENOMEM; goto tx_fail; } /* Now allocate the RX */ if (!(adapter->rx_rings = (struct rx_ring *) malloc(sizeof(struct rx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); error = ENOMEM; goto rx_fail; } tsize = roundup2(adapter->num_tx_desc * sizeof(union e1000_adv_tx_desc), IGB_DBA_ALIGN); /* * Now set up the TX queues, txconf is needed to handle the * possibility that things fail midcourse and we need to * undo memory gracefully */ for (int i = 0; i < adapter->num_queues; i++, txconf++) { /* Set up some basics */ txr = &adapter->tx_rings[i]; txr->adapter = adapter; txr->me = i; txr->num_desc = adapter->num_tx_desc; /* Initialize the TX lock */ snprintf(txr->mtx_name, sizeof(txr->mtx_name), "%s:tx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&txr->tx_mtx, txr->mtx_name, NULL, MTX_DEF); if (igb_dma_malloc(adapter, tsize, &txr->txdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate TX Descriptor memory\n"); error = ENOMEM; goto err_tx_desc; } txr->tx_base = (union e1000_adv_tx_desc *)txr->txdma.dma_vaddr; bzero((void *)txr->tx_base, tsize); /* Now allocate transmit buffers for the ring */ if (igb_allocate_transmit_buffers(txr)) { device_printf(dev, "Critical Failure setting up transmit buffers\n"); error = ENOMEM; goto err_tx_desc; } #ifndef IGB_LEGACY_TX /* Allocate a buf ring */ txr->br = buf_ring_alloc(igb_buf_ring_size, M_DEVBUF, M_WAITOK, &txr->tx_mtx); #endif } /* * Next the RX queues... */ rsize = roundup2(adapter->num_rx_desc * sizeof(union e1000_adv_rx_desc), IGB_DBA_ALIGN); for (int i = 0; i < adapter->num_queues; i++, rxconf++) { rxr = &adapter->rx_rings[i]; rxr->adapter = adapter; rxr->me = i; /* Initialize the RX lock */ snprintf(rxr->mtx_name, sizeof(rxr->mtx_name), "%s:rx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&rxr->rx_mtx, rxr->mtx_name, NULL, MTX_DEF); if (igb_dma_malloc(adapter, rsize, &rxr->rxdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate RxDescriptor memory\n"); error = ENOMEM; goto err_rx_desc; } rxr->rx_base = (union e1000_adv_rx_desc *)rxr->rxdma.dma_vaddr; bzero((void *)rxr->rx_base, rsize); /* Allocate receive buffers for the ring*/ if (igb_allocate_receive_buffers(rxr)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); error = ENOMEM; goto err_rx_desc; } } /* ** Finally set up the queue holding structs */ for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; que->adapter = adapter; que->txr = &adapter->tx_rings[i]; que->rxr = &adapter->rx_rings[i]; } return (0); err_rx_desc: for (rxr = adapter->rx_rings; rxconf > 0; rxr++, rxconf--) igb_dma_free(adapter, &rxr->rxdma); err_tx_desc: for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--) igb_dma_free(adapter, &txr->txdma); free(adapter->rx_rings, M_DEVBUF); rx_fail: #ifndef IGB_LEGACY_TX buf_ring_free(txr->br, M_DEVBUF); #endif free(adapter->tx_rings, M_DEVBUF); tx_fail: free(adapter->queues, M_DEVBUF); fail: return (error); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int igb_allocate_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; device_t dev = adapter->dev; struct igb_tx_buf *txbuf; int error, i; /* * Setup DMA descriptor areas. */ if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IGB_TSO_SIZE, /* maxsize */ IGB_MAX_SCATTER, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->txtag))) { device_printf(dev,"Unable to allocate TX DMA tag\n"); goto fail; } if (!(txr->tx_buffers = (struct igb_tx_buf *) malloc(sizeof(struct igb_tx_buf) * adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { error = bus_dmamap_create(txr->txtag, 0, &txbuf->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } return 0; fail: /* We free all, it handles case where we are in the middle */ igb_free_transmit_structures(adapter); return (error); } /********************************************************************* * * Initialize a transmit ring. * **********************************************************************/ static void igb_setup_transmit_ring(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct igb_tx_buf *txbuf; int i; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ /* Clear the old descriptor contents */ IGB_TX_LOCK(txr); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_TX, txr->me, 0); #endif /* DEV_NETMAP */ bzero((void *)txr->tx_base, (sizeof(union e1000_adv_tx_desc)) * adapter->num_tx_desc); /* Reset indices */ txr->next_avail_desc = 0; txr->next_to_clean = 0; /* Free any existing tx buffers. */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { if (txbuf->m_head != NULL) { bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, txbuf->map); m_freem(txbuf->m_head); txbuf->m_head = NULL; } #ifdef DEV_NETMAP if (slot) { int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); /* no need to set the address */ netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si)); } #endif /* DEV_NETMAP */ /* clear the watch index */ txbuf->eop = NULL; } /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); IGB_TX_UNLOCK(txr); } /********************************************************************* * * Initialize all transmit rings. * **********************************************************************/ static void igb_setup_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) igb_setup_transmit_ring(txr); return; } /********************************************************************* * * Enable transmit unit. * **********************************************************************/ static void igb_initialize_transmit_units(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct e1000_hw *hw = &adapter->hw; u32 tctl, txdctl; INIT_DEBUGOUT("igb_initialize_transmit_units: begin"); tctl = txdctl = 0; /* Setup the Tx Descriptor Rings */ for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 bus_addr = txr->txdma.dma_paddr; E1000_WRITE_REG(hw, E1000_TDLEN(i), adapter->num_tx_desc * sizeof(struct e1000_tx_desc)); E1000_WRITE_REG(hw, E1000_TDBAH(i), (uint32_t)(bus_addr >> 32)); E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr); /* Setup the HW Tx Head and Tail descriptor pointers */ E1000_WRITE_REG(hw, E1000_TDT(i), 0); E1000_WRITE_REG(hw, E1000_TDH(i), 0); HW_DEBUGOUT2("Base = %x, Length = %x\n", E1000_READ_REG(hw, E1000_TDBAL(i)), E1000_READ_REG(hw, E1000_TDLEN(i))); txr->queue_status = IGB_QUEUE_IDLE; txdctl |= IGB_TX_PTHRESH; txdctl |= IGB_TX_HTHRESH << 8; txdctl |= IGB_TX_WTHRESH << 16; txdctl |= E1000_TXDCTL_QUEUE_ENABLE; E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl); } if (adapter->vf_ifp) return; e1000_config_collision_dist(hw); /* Program the Transmit Control Register */ tctl = E1000_READ_REG(hw, E1000_TCTL); tctl &= ~E1000_TCTL_CT; tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN | (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT)); /* This write will effectively turn on the transmit unit. */ E1000_WRITE_REG(hw, E1000_TCTL, tctl); } /********************************************************************* * * Free all transmit rings. * **********************************************************************/ static void igb_free_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); igb_free_transmit_buffers(txr); igb_dma_free(adapter, &txr->txdma); IGB_TX_UNLOCK(txr); IGB_TX_LOCK_DESTROY(txr); } free(adapter->tx_rings, M_DEVBUF); } /********************************************************************* * * Free transmit ring related data structures. * **********************************************************************/ static void igb_free_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct igb_tx_buf *tx_buffer; int i; INIT_DEBUGOUT("free_transmit_ring: begin"); if (txr->tx_buffers == NULL) return; tx_buffer = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, tx_buffer++) { if (tx_buffer->m_head != NULL) { bus_dmamap_sync(txr->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; if (tx_buffer->map != NULL) { bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } else if (tx_buffer->map != NULL) { bus_dmamap_unload(txr->txtag, tx_buffer->map); bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } #ifndef IGB_LEGACY_TX if (txr->br != NULL) buf_ring_free(txr->br, M_DEVBUF); #endif if (txr->tx_buffers != NULL) { free(txr->tx_buffers, M_DEVBUF); txr->tx_buffers = NULL; } if (txr->txtag != NULL) { bus_dma_tag_destroy(txr->txtag); txr->txtag = NULL; } return; } /********************************************************************** * * Setup work for hardware segmentation offload (TSO) on * adapters using advanced tx descriptors * **********************************************************************/ static int igb_tso_setup(struct tx_ring *txr, struct mbuf *mp, u32 *cmd_type_len, u32 *olinfo_status) { struct adapter *adapter = txr->adapter; struct e1000_adv_tx_context_desc *TXD; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; u32 mss_l4len_idx = 0, paylen; u16 vtag = 0, eh_type; int ctxd, ehdrlen, ip_hlen, tcp_hlen; struct ether_vlan_header *eh; #ifdef INET6 struct ip6_hdr *ip6; #endif #ifdef INET struct ip *ip; #endif struct tcphdr *th; /* * Determine where frame payload starts. * Jump over vlan headers if already present */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; eh_type = eh->evl_proto; } else { ehdrlen = ETHER_HDR_LEN; eh_type = eh->evl_encap_proto; } switch (ntohs(eh_type)) { #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); /* XXX-BZ For now we do not pretend to support ext. hdrs. */ if (ip6->ip6_nxt != IPPROTO_TCP) return (ENXIO); ip_hlen = sizeof(struct ip6_hdr); ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6; break; #endif #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); if (ip->ip_p != IPPROTO_TCP) return (ENXIO); ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4; /* Tell transmit desc to also do IPv4 checksum. */ *olinfo_status |= E1000_TXD_POPTS_IXSM << 8; break; #endif default: panic("%s: CSUM_TSO but no supported IP version (0x%04x)", __func__, ntohs(eh_type)); break; } ctxd = txr->next_avail_desc; TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[ctxd]; tcp_hlen = th->th_off << 2; /* This is used in the transmit desc in encap */ paylen = mp->m_pkthdr.len - ehdrlen - ip_hlen - tcp_hlen; /* VLAN MACLEN IPLEN */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << E1000_ADVTXD_VLAN_SHIFT); } vlan_macip_lens |= ehdrlen << E1000_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= ip_hlen; TXD->vlan_macip_lens = htole32(vlan_macip_lens); /* ADV DTYPE TUCMD */ type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP; TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); /* MSS L4LEN IDX */ mss_l4len_idx |= (mp->m_pkthdr.tso_segsz << E1000_ADVTXD_MSS_SHIFT); mss_l4len_idx |= (tcp_hlen << E1000_ADVTXD_L4LEN_SHIFT); /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) mss_l4len_idx |= txr->me << 4; TXD->mss_l4len_idx = htole32(mss_l4len_idx); TXD->seqnum_seed = htole32(0); if (++ctxd == txr->num_desc) ctxd = 0; txr->tx_avail--; txr->next_avail_desc = ctxd; *cmd_type_len |= E1000_ADVTXD_DCMD_TSE; *olinfo_status |= E1000_TXD_POPTS_TXSM << 8; *olinfo_status |= paylen << E1000_ADVTXD_PAYLEN_SHIFT; ++txr->tso_tx; return (0); } /********************************************************************* * * Advanced Context Descriptor setup for VLAN, CSUM or TSO * **********************************************************************/ static int igb_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp, u32 *cmd_type_len, u32 *olinfo_status) { struct e1000_adv_tx_context_desc *TXD; struct adapter *adapter = txr->adapter; struct ether_vlan_header *eh; struct ip *ip; struct ip6_hdr *ip6; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0, mss_l4len_idx = 0; int ehdrlen, ip_hlen = 0; u16 etype; u8 ipproto = 0; int offload = TRUE; int ctxd = txr->next_avail_desc; u16 vtag = 0; /* First check if TSO is to be used */ if (mp->m_pkthdr.csum_flags & CSUM_TSO) return (igb_tso_setup(txr, mp, cmd_type_len, olinfo_status)); if ((mp->m_pkthdr.csum_flags & CSUM_OFFLOAD) == 0) offload = FALSE; /* Indicate the whole packet as payload when not doing TSO */ *olinfo_status |= mp->m_pkthdr.len << E1000_ADVTXD_PAYLEN_SHIFT; /* Now ready a context descriptor */ TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[ctxd]; /* ** In advanced descriptors the vlan tag must ** be placed into the context descriptor. Hence ** we need to make one even if not doing offloads. */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << E1000_ADVTXD_VLAN_SHIFT); } else if (offload == FALSE) /* ... no offload to do */ return (0); /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } /* Set the ether header length */ vlan_macip_lens |= ehdrlen << E1000_ADVTXD_MACLEN_SHIFT; switch (etype) { case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = ip->ip_hl << 2; ipproto = ip->ip_p; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); ip_hlen = sizeof(struct ip6_hdr); /* XXX-BZ this will go badly in case of ext hdrs. */ ipproto = ip6->ip6_nxt; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6; break; default: offload = FALSE; break; } vlan_macip_lens |= ip_hlen; type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT; switch (ipproto) { case IPPROTO_TCP: if (mp->m_pkthdr.csum_flags & CSUM_TCP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP; break; case IPPROTO_UDP: if (mp->m_pkthdr.csum_flags & CSUM_UDP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP; break; #if __FreeBSD_version >= 800000 case IPPROTO_SCTP: if (mp->m_pkthdr.csum_flags & CSUM_SCTP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP; break; #endif default: offload = FALSE; break; } if (offload) /* For the TX descriptor setup */ *olinfo_status |= E1000_TXD_POPTS_TXSM << 8; /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) mss_l4len_idx = txr->me << 4; /* Now copy bits into descriptor */ TXD->vlan_macip_lens = htole32(vlan_macip_lens); TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); TXD->seqnum_seed = htole32(0); TXD->mss_l4len_idx = htole32(mss_l4len_idx); /* We've consumed the first desc, adjust counters */ if (++ctxd == txr->num_desc) ctxd = 0; txr->next_avail_desc = ctxd; --txr->tx_avail; return (0); } /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. * * TRUE return means there's work in the ring to clean, FALSE its empty. **********************************************************************/ static bool igb_txeof(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; u32 work, processed = 0; u16 limit = txr->process_limit; struct igb_tx_buf *buf; union e1000_adv_tx_desc *txd; mtx_assert(&txr->tx_mtx, MA_OWNED); #ifdef DEV_NETMAP if (netmap_tx_irq(ifp, txr->me)) return (FALSE); #endif /* DEV_NETMAP */ if (txr->tx_avail == txr->num_desc) { txr->queue_status = IGB_QUEUE_IDLE; return FALSE; } /* Get work starting point */ work = txr->next_to_clean; buf = &txr->tx_buffers[work]; txd = &txr->tx_base[work]; work -= txr->num_desc; /* The distance to ring end */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); do { union e1000_adv_tx_desc *eop = buf->eop; if (eop == NULL) /* No work */ break; if ((eop->wb.status & E1000_TXD_STAT_DD) == 0) break; /* I/O not complete */ if (buf->m_head) { txr->bytes += buf->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; } buf->eop = NULL; ++txr->tx_avail; /* We clean the range if multi segment */ while (txd != eop) { ++txd; ++buf; ++work; /* wrap the ring? */ if (__predict_false(!work)) { work -= txr->num_desc; buf = txr->tx_buffers; txd = txr->tx_base; } if (buf->m_head) { txr->bytes += buf->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; } ++txr->tx_avail; buf->eop = NULL; } ++txr->packets; ++processed; if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); txr->watchdog_time = ticks; /* Try the next packet */ ++txd; ++buf; ++work; /* reset with a wrap */ if (__predict_false(!work)) { work -= txr->num_desc; buf = txr->tx_buffers; txd = txr->tx_base; } prefetch(txd); } while (__predict_true(--limit)); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); work += txr->num_desc; txr->next_to_clean = work; /* ** Watchdog calculation, we know there's ** work outstanding or the first return ** would have been taken, so none processed ** for too long indicates a hang. */ if ((!processed) && ((ticks - txr->watchdog_time) > IGB_WATCHDOG)) txr->queue_status |= IGB_QUEUE_HUNG; if (txr->tx_avail >= IGB_QUEUE_THRESHOLD) txr->queue_status &= ~IGB_QUEUE_DEPLETED; if (txr->tx_avail == txr->num_desc) { txr->queue_status = IGB_QUEUE_IDLE; return (FALSE); } return (TRUE); } /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings * - now keeps its own state so discards due to resource * exhaustion are unnecessary, if an mbuf cannot be obtained * it just returns, keeping its placeholder, thus it can simply * be recalled to try again. * **********************************************************************/ static void igb_refresh_mbufs(struct rx_ring *rxr, int limit) { struct adapter *adapter = rxr->adapter; bus_dma_segment_t hseg[1]; bus_dma_segment_t pseg[1]; struct igb_rx_buf *rxbuf; struct mbuf *mh, *mp; int i, j, nsegs, error; bool refreshed = FALSE; i = j = rxr->next_to_refresh; /* ** Get one descriptor beyond ** our work mark to control ** the loop. */ if (++j == adapter->num_rx_desc) j = 0; while (j != limit) { rxbuf = &rxr->rx_buffers[i]; /* No hdr mbuf used with header split off */ if (rxr->hdr_split == FALSE) goto no_split; if (rxbuf->m_head == NULL) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) goto update; } else mh = rxbuf->m_head; mh->m_pkthdr.len = mh->m_len = MHLEN; mh->m_len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, rxbuf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: hdr dmamap load" " failure - %d\n", error); m_free(mh); rxbuf->m_head = NULL; goto update; } rxbuf->m_head = mh; bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.hdr_addr = htole64(hseg[0].ds_addr); no_split: if (rxbuf->m_pack == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (mp == NULL) goto update; } else mp = rxbuf->m_pack; mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: payload dmamap load" " failure - %d\n", error); m_free(mp); rxbuf->m_pack = NULL; goto update; } rxbuf->m_pack = mp; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.pkt_addr = htole64(pseg[0].ds_addr); refreshed = TRUE; /* I feel wefreshed :) */ i = j; /* our next is precalculated */ rxr->next_to_refresh = i; if (++j == adapter->num_rx_desc) j = 0; } update: if (refreshed) /* update tail */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), rxr->next_to_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int igb_allocate_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; device_t dev = adapter->dev; struct igb_rx_buf *rxbuf; int i, bsize, error; bsize = sizeof(struct igb_rx_buf) * adapter->num_rx_desc; if (!(rxr->rx_buffers = (struct igb_rx_buf *) malloc(bsize, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); error = ENOMEM; goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MSIZE, /* maxsize */ 1, /* nsegments */ MSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->htag))) { device_printf(dev, "Unable to create RX DMA tag\n"); goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM9BYTES, /* maxsize */ 1, /* nsegments */ MJUM9BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->ptag))) { device_printf(dev, "Unable to create RX payload DMA tag\n"); goto fail; } for (i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; error = bus_dmamap_create(rxr->htag, 0, &rxbuf->hmap); if (error) { device_printf(dev, "Unable to create RX head DMA maps\n"); goto fail; } error = bus_dmamap_create(rxr->ptag, 0, &rxbuf->pmap); if (error) { device_printf(dev, "Unable to create RX packet DMA maps\n"); goto fail; } } return (0); fail: /* Frees all, but can handle partial completion */ igb_free_receive_structures(adapter); return (error); } static void igb_free_receive_ring(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct igb_rx_buf *rxbuf; for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, rxbuf->hmap); rxbuf->m_head->m_flags |= M_PKTHDR; m_freem(rxbuf->m_head); } if (rxbuf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->m_pack->m_flags |= M_PKTHDR; m_freem(rxbuf->m_pack); } rxbuf->m_head = NULL; rxbuf->m_pack = NULL; } } /********************************************************************* * * Initialize a receive ring and its buffers. * **********************************************************************/ static int igb_setup_receive_ring(struct rx_ring *rxr) { struct adapter *adapter; struct ifnet *ifp; device_t dev; struct igb_rx_buf *rxbuf; bus_dma_segment_t pseg[1], hseg[1]; struct lro_ctrl *lro = &rxr->lro; int rsize, nsegs, error = 0; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(rxr->adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ adapter = rxr->adapter; dev = adapter->dev; ifp = adapter->ifp; /* Clear the ring contents */ IGB_RX_LOCK(rxr); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_RX, rxr->me, 0); #endif /* DEV_NETMAP */ rsize = roundup2(adapter->num_rx_desc * sizeof(union e1000_adv_rx_desc), IGB_DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); /* ** Free current RX buffer structures and their mbufs */ igb_free_receive_ring(rxr); /* Configure for header split? */ if (igb_header_split) rxr->hdr_split = TRUE; /* Now replenish the ring mbufs */ for (int j = 0; j < adapter->num_rx_desc; ++j) { struct mbuf *mh, *mp; rxbuf = &rxr->rx_buffers[j]; #ifdef DEV_NETMAP if (slot) { /* slot sj is mapped to the j-th NIC-ring entry */ int sj = netmap_idx_n2k(&na->rx_rings[rxr->me], j); uint64_t paddr; void *addr; addr = PNMB(na, slot + sj, &paddr); netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr); /* Update descriptor */ rxr->rx_base[j].read.pkt_addr = htole64(paddr); continue; } #endif /* DEV_NETMAP */ if (rxr->hdr_split == FALSE) goto skip_head; /* First the header */ rxbuf->m_head = m_gethdr(M_NOWAIT, MT_DATA); if (rxbuf->m_head == NULL) { error = ENOBUFS; goto fail; } m_adj(rxbuf->m_head, ETHER_ALIGN); mh = rxbuf->m_head; mh->m_len = mh->m_pkthdr.len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, rxbuf->hmap, rxbuf->m_head, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) /* Nothing elegant to do here */ goto fail; bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->rx_base[j].read.hdr_addr = htole64(hseg[0].ds_addr); skip_head: /* Now the payload cluster */ rxbuf->m_pack = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (rxbuf->m_pack == NULL) { error = ENOBUFS; goto fail; } mp = rxbuf->m_pack; mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) goto fail; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->rx_base[j].read.pkt_addr = htole64(pseg[0].ds_addr); } /* Setup our descriptor indices */ rxr->next_to_check = 0; rxr->next_to_refresh = adapter->num_rx_desc - 1; rxr->lro_enabled = FALSE; rxr->rx_split_packets = 0; rxr->rx_bytes = 0; rxr->fmp = NULL; rxr->lmp = NULL; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* ** Now set up the LRO interface, we ** also only do head split when LRO ** is enabled, since so often they ** are undesireable in similar setups. */ if (ifp->if_capenable & IFCAP_LRO) { error = tcp_lro_init(lro); if (error) { device_printf(dev, "LRO Initialization failed!\n"); goto fail; } INIT_DEBUGOUT("RX LRO Initialized\n"); rxr->lro_enabled = TRUE; lro->ifp = adapter->ifp; } IGB_RX_UNLOCK(rxr); return (0); fail: igb_free_receive_ring(rxr); IGB_RX_UNLOCK(rxr); return (error); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int igb_setup_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; int i; for (i = 0; i < adapter->num_queues; i++, rxr++) if (igb_setup_receive_ring(rxr)) goto fail; return (0); fail: /* * Free RX buffers allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'i' is the endpoint. */ for (int j = 0; j < i; ++j) { rxr = &adapter->rx_rings[j]; IGB_RX_LOCK(rxr); igb_free_receive_ring(rxr); IGB_RX_UNLOCK(rxr); } return (ENOBUFS); } /* * Initialise the RSS mapping for NICs that support multiple transmit/ * receive rings. */ static void igb_initialise_rss_mapping(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; int i; int queue_id; u32 reta; u32 rss_key[10], mrqc, shift = 0; /* XXX? */ if (adapter->hw.mac.type == e1000_82575) shift = 6; /* * The redirection table controls which destination * queue each bucket redirects traffic to. * Each DWORD represents four queues, with the LSB * being the first queue in the DWORD. * * This just allocates buckets to queues using round-robin * allocation. * * NOTE: It Just Happens to line up with the default * RSS allocation method. */ /* Warning FM follows */ reta = 0; for (i = 0; i < 128; i++) { #ifdef RSS queue_id = rss_get_indirection_to_bucket(i); /* * If we have more queues than buckets, we'll * end up mapping buckets to a subset of the * queues. * * If we have more buckets than queues, we'll * end up instead assigning multiple buckets * to queues. * * Both are suboptimal, but we need to handle * the case so we don't go out of bounds * indexing arrays and such. */ queue_id = queue_id % adapter->num_queues; #else queue_id = (i % adapter->num_queues); #endif /* Adjust if required */ queue_id = queue_id << shift; /* * The low 8 bits are for hash value (n+0); * The next 8 bits are for hash value (n+1), etc. */ reta = reta >> 8; reta = reta | ( ((uint32_t) queue_id) << 24); if ((i & 3) == 3) { E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta); reta = 0; } } /* Now fill in hash table */ /* XXX This means RSS enable + 8 queues for my igb (82580.) */ mrqc = E1000_MRQC_ENABLE_RSS_4Q; #ifdef RSS /* XXX ew typecasting */ rss_getkey((uint8_t *) &rss_key); #else arc4rand(&rss_key, sizeof(rss_key), 0); #endif for (i = 0; i < 10; i++) E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key[i]); /* * Configure the RSS fields to hash upon. */ mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 | E1000_MRQC_RSS_FIELD_IPV4_TCP); mrqc |= (E1000_MRQC_RSS_FIELD_IPV6 | E1000_MRQC_RSS_FIELD_IPV6_TCP); mrqc |=( E1000_MRQC_RSS_FIELD_IPV4_UDP | E1000_MRQC_RSS_FIELD_IPV6_UDP); mrqc |=( E1000_MRQC_RSS_FIELD_IPV6_UDP_EX | E1000_MRQC_RSS_FIELD_IPV6_TCP_EX); E1000_WRITE_REG(hw, E1000_MRQC, mrqc); } /********************************************************************* * * Enable receive unit. * **********************************************************************/ static void igb_initialize_receive_units(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; struct ifnet *ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; u32 rctl, rxcsum, psize, srrctl = 0; INIT_DEBUGOUT("igb_initialize_receive_unit: begin"); /* * Make sure receives are disabled while setting * up the descriptor ring */ rctl = E1000_READ_REG(hw, E1000_RCTL); E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN); /* ** Set up for header split */ if (igb_header_split) { /* Use a standard mbuf for the header */ srrctl |= IGB_HDR_BUF << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS; } else srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF; /* ** Set up for jumbo frames */ if (ifp->if_mtu > ETHERMTU) { rctl |= E1000_RCTL_LPE; if (adapter->rx_mbuf_sz == MJUMPAGESIZE) { srrctl |= 4096 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX; } else if (adapter->rx_mbuf_sz > MJUMPAGESIZE) { srrctl |= 8192 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX; } /* Set maximum packet len */ psize = adapter->max_frame_size; /* are we on a vlan? */ if (adapter->ifp->if_vlantrunk != NULL) psize += VLAN_TAG_SIZE; E1000_WRITE_REG(&adapter->hw, E1000_RLPML, psize); } else { rctl &= ~E1000_RCTL_LPE; srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_2048; } /* * If TX flow control is disabled and there's >1 queue defined, * enable DROP. * * This drops frames rather than hanging the RX MAC for all queues. */ if ((adapter->num_queues > 1) && (adapter->fc == e1000_fc_none || adapter->fc == e1000_fc_rx_pause)) { srrctl |= E1000_SRRCTL_DROP_EN; } /* Setup the Base and Length of the Rx Descriptor Rings */ for (int i = 0; i < adapter->num_queues; i++, rxr++) { u64 bus_addr = rxr->rxdma.dma_paddr; u32 rxdctl; E1000_WRITE_REG(hw, E1000_RDLEN(i), adapter->num_rx_desc * sizeof(struct e1000_rx_desc)); E1000_WRITE_REG(hw, E1000_RDBAH(i), (uint32_t)(bus_addr >> 32)); E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr); E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl); /* Enable this Queue */ rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i)); rxdctl |= E1000_RXDCTL_QUEUE_ENABLE; rxdctl &= 0xFFF00000; rxdctl |= IGB_RX_PTHRESH; rxdctl |= IGB_RX_HTHRESH << 8; rxdctl |= IGB_RX_WTHRESH << 16; E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl); } /* ** Setup for RX MultiQueue */ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM); if (adapter->num_queues >1) { /* rss setup */ igb_initialise_rss_mapping(adapter); /* ** NOTE: Receive Full-Packet Checksum Offload ** is mutually exclusive with Multiqueue. However ** this is not the same as TCP/IP checksums which ** still work. */ rxcsum |= E1000_RXCSUM_PCSD; #if __FreeBSD_version >= 800000 /* For SCTP Offload */ if ((hw->mac.type == e1000_82576) && (ifp->if_capenable & IFCAP_RXCSUM)) rxcsum |= E1000_RXCSUM_CRCOFL; #endif } else { /* Non RSS setup */ if (ifp->if_capenable & IFCAP_RXCSUM) { rxcsum |= E1000_RXCSUM_IPPCSE; #if __FreeBSD_version >= 800000 if (adapter->hw.mac.type == e1000_82576) rxcsum |= E1000_RXCSUM_CRCOFL; #endif } else rxcsum &= ~E1000_RXCSUM_TUOFL; } E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum); /* Setup the Receive Control Register */ rctl &= ~(3 << E1000_RCTL_MO_SHIFT); rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO | E1000_RCTL_RDMTS_HALF | (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT); /* Strip CRC bytes. */ rctl |= E1000_RCTL_SECRC; /* Make sure VLAN Filters are off */ rctl &= ~E1000_RCTL_VFE; /* Don't store bad packets */ rctl &= ~E1000_RCTL_SBP; /* Enable Receives */ E1000_WRITE_REG(hw, E1000_RCTL, rctl); /* * Setup the HW Rx Head and Tail Descriptor Pointers * - needs to be after enable */ for (int i = 0; i < adapter->num_queues; i++) { rxr = &adapter->rx_rings[i]; E1000_WRITE_REG(hw, E1000_RDH(i), rxr->next_to_check); #ifdef DEV_NETMAP /* * an init() while a netmap client is active must * preserve the rx buffers passed to userspace. * In this driver it means we adjust RDT to * something different from next_to_refresh * (which is not used in netmap mode). */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; int t = rxr->next_to_refresh - nm_kr_rxspace(kring); if (t >= adapter->num_rx_desc) t -= adapter->num_rx_desc; else if (t < 0) t += adapter->num_rx_desc; E1000_WRITE_REG(hw, E1000_RDT(i), t); } else #endif /* DEV_NETMAP */ E1000_WRITE_REG(hw, E1000_RDT(i), rxr->next_to_refresh); } return; } /********************************************************************* * * Free receive rings. * **********************************************************************/ static void igb_free_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; for (int i = 0; i < adapter->num_queues; i++, rxr++) { struct lro_ctrl *lro = &rxr->lro; igb_free_receive_buffers(rxr); tcp_lro_free(lro); igb_dma_free(adapter, &rxr->rxdma); } free(adapter->rx_rings, M_DEVBUF); } /********************************************************************* * * Free receive ring data structures. * **********************************************************************/ static void igb_free_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct igb_rx_buf *rxbuf; int i; INIT_DEBUGOUT("free_receive_structures: begin"); /* Cleanup any existing buffers */ if (rxr->rx_buffers != NULL) { for (i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, rxbuf->hmap); rxbuf->m_head->m_flags |= M_PKTHDR; m_freem(rxbuf->m_head); } if (rxbuf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->m_pack->m_flags |= M_PKTHDR; m_freem(rxbuf->m_pack); } rxbuf->m_head = NULL; rxbuf->m_pack = NULL; if (rxbuf->hmap != NULL) { bus_dmamap_destroy(rxr->htag, rxbuf->hmap); rxbuf->hmap = NULL; } if (rxbuf->pmap != NULL) { bus_dmamap_destroy(rxr->ptag, rxbuf->pmap); rxbuf->pmap = NULL; } } if (rxr->rx_buffers != NULL) { free(rxr->rx_buffers, M_DEVBUF); rxr->rx_buffers = NULL; } } if (rxr->htag != NULL) { bus_dma_tag_destroy(rxr->htag); rxr->htag = NULL; } if (rxr->ptag != NULL) { bus_dma_tag_destroy(rxr->ptag); rxr->ptag = NULL; } } static __inline void igb_rx_discard(struct rx_ring *rxr, int i) { struct igb_rx_buf *rbuf; rbuf = &rxr->rx_buffers[i]; /* Partially received? Free the chain */ if (rxr->fmp != NULL) { rxr->fmp->m_flags |= M_PKTHDR; m_freem(rxr->fmp); rxr->fmp = NULL; rxr->lmp = NULL; } /* ** With advanced descriptors the writeback ** clobbers the buffer addrs, so its easier ** to just free the existing mbufs and take ** the normal refresh path to get new buffers ** and mapping. */ if (rbuf->m_head) { m_free(rbuf->m_head); rbuf->m_head = NULL; bus_dmamap_unload(rxr->htag, rbuf->hmap); } if (rbuf->m_pack) { m_free(rbuf->m_pack); rbuf->m_pack = NULL; bus_dmamap_unload(rxr->ptag, rbuf->pmap); } return; } static __inline void igb_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u32 ptype) { /* * ATM LRO is only for IPv4/TCP packets and TCP checksum of the packet * should be computed by hardware. Also it should not have VLAN tag in * ethernet header. */ if (rxr->lro_enabled && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (ptype & E1000_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & (E1000_RXDADV_PKTTYPE_IPV4 | E1000_RXDADV_PKTTYPE_TCP)) == (E1000_RXDADV_PKTTYPE_IPV4 | E1000_RXDADV_PKTTYPE_TCP) && (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) == (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) { /* * Send to the stack if: ** - LRO not enabled, or ** - no LRO resources, or ** - lro enqueue fails */ if (rxr->lro.lro_cnt != 0) if (tcp_lro_rx(&rxr->lro, m, 0) == 0) return; } IGB_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, m); IGB_RX_LOCK(rxr); } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * Return TRUE if more to clean, FALSE otherwise *********************************************************************/ static bool igb_rxeof(struct igb_queue *que, int count, int *done) { struct adapter *adapter = que->adapter; struct rx_ring *rxr = que->rxr; struct ifnet *ifp = adapter->ifp; struct lro_ctrl *lro = &rxr->lro; struct lro_entry *queued; int i, processed = 0, rxdone = 0; u32 ptype, staterr = 0; union e1000_adv_rx_desc *cur; IGB_RX_LOCK(rxr); /* Sync the ring. */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, rxr->me, &processed)) { IGB_RX_UNLOCK(rxr); return (FALSE); } #endif /* DEV_NETMAP */ /* Main clean loop */ for (i = rxr->next_to_check; count != 0;) { struct mbuf *sendmp, *mh, *mp; struct igb_rx_buf *rxbuf; u16 hlen, plen, hdr, vtag, pkt_info; bool eop = FALSE; cur = &rxr->rx_base[i]; staterr = le32toh(cur->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; count--; sendmp = mh = mp = NULL; cur->wb.upper.status_error = 0; rxbuf = &rxr->rx_buffers[i]; plen = le16toh(cur->wb.upper.length); ptype = le32toh(cur->wb.lower.lo_dword.data) & IGB_PKTTYPE_MASK; if (((adapter->hw.mac.type == e1000_i350) || (adapter->hw.mac.type == e1000_i354)) && (staterr & E1000_RXDEXT_STATERR_LB)) vtag = be16toh(cur->wb.upper.vlan); else vtag = le16toh(cur->wb.upper.vlan); hdr = le16toh(cur->wb.lower.lo_dword.hs_rss.hdr_info); pkt_info = le16toh(cur->wb.lower.lo_dword.hs_rss.pkt_info); eop = ((staterr & E1000_RXD_STAT_EOP) == E1000_RXD_STAT_EOP); /* * Free the frame (all segments) if we're at EOP and * it's an error. * * The datasheet states that EOP + status is only valid for * the final segment in a multi-segment frame. */ if (eop && ((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) != 0)) { adapter->dropped_pkts++; ++rxr->rx_discarded; igb_rx_discard(rxr, i); goto next_desc; } /* ** The way the hardware is configured to ** split, it will ONLY use the header buffer ** when header split is enabled, otherwise we ** get normal behavior, ie, both header and ** payload are DMA'd into the payload buffer. ** ** The fmp test is to catch the case where a ** packet spans multiple descriptors, in that ** case only the first header is valid. */ if (rxr->hdr_split && rxr->fmp == NULL) { bus_dmamap_unload(rxr->htag, rxbuf->hmap); hlen = (hdr & E1000_RXDADV_HDRBUFLEN_MASK) >> E1000_RXDADV_HDRBUFLEN_SHIFT; if (hlen > IGB_HDR_BUF) hlen = IGB_HDR_BUF; mh = rxr->rx_buffers[i].m_head; mh->m_len = hlen; /* clear buf pointer for refresh */ rxbuf->m_head = NULL; /* ** Get the payload length, this ** could be zero if its a small ** packet. */ if (plen > 0) { mp = rxr->rx_buffers[i].m_pack; mp->m_len = plen; mh->m_next = mp; /* clear buf pointer */ rxbuf->m_pack = NULL; rxr->rx_split_packets++; } } else { /* ** Either no header split, or a ** secondary piece of a fragmented ** split packet. */ mh = rxr->rx_buffers[i].m_pack; mh->m_len = plen; /* clear buf info for refresh */ rxbuf->m_pack = NULL; } bus_dmamap_unload(rxr->ptag, rxbuf->pmap); ++processed; /* So we know when to refresh */ /* Initial frame - setup */ if (rxr->fmp == NULL) { mh->m_pkthdr.len = mh->m_len; /* Save the head of the chain */ rxr->fmp = mh; rxr->lmp = mh; if (mp != NULL) { /* Add payload if split */ mh->m_pkthdr.len += mp->m_len; rxr->lmp = mh->m_next; } } else { /* Chain mbuf's together */ rxr->lmp->m_next = mh; rxr->lmp = rxr->lmp->m_next; rxr->fmp->m_pkthdr.len += mh->m_len; } if (eop) { rxr->fmp->m_pkthdr.rcvif = ifp; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); rxr->rx_packets++; /* capture data for AIM */ rxr->packets++; rxr->bytes += rxr->fmp->m_pkthdr.len; rxr->rx_bytes += rxr->fmp->m_pkthdr.len; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) igb_rx_checksum(staterr, rxr->fmp, ptype); if ((ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (staterr & E1000_RXD_STAT_VP) != 0) { rxr->fmp->m_pkthdr.ether_vtag = vtag; rxr->fmp->m_flags |= M_VLANTAG; } #ifdef RSS /* XXX set flowtype once this works right */ rxr->fmp->m_pkthdr.flowid = le32toh(cur->wb.lower.hi_dword.rss); - rxr->fmp->m_flags |= M_FLOWID; switch (pkt_info & E1000_RXDADV_RSSTYPE_MASK) { case E1000_RXDADV_RSSTYPE_IPV4_TCP: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV4); break; case E1000_RXDADV_RSSTYPE_IPV4: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV4); break; case E1000_RXDADV_RSSTYPE_IPV6_TCP: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV6); break; case E1000_RXDADV_RSSTYPE_IPV6_EX: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV6_EX); break; case E1000_RXDADV_RSSTYPE_IPV6: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV6); break; case E1000_RXDADV_RSSTYPE_IPV6_TCP_EX: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV6_EX); break; /* XXX no UDP support in RSS just yet */ #ifdef notyet case E1000_RXDADV_RSSTYPE_IPV4_UDP: case E1000_RXDADV_RSSTYPE_IPV6_UDP: case E1000_RXDADV_RSSTYPE_IPV6_UDP_EX: #endif default: /* XXX fallthrough */ - M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_NONE); + M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_OPAQUE); } #elif !defined(IGB_LEGACY_TX) rxr->fmp->m_pkthdr.flowid = que->msix; - rxr->fmp->m_flags |= M_FLOWID; + M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_OPAQUE); #endif sendmp = rxr->fmp; /* Make sure to set M_PKTHDR. */ sendmp->m_flags |= M_PKTHDR; rxr->fmp = NULL; rxr->lmp = NULL; } next_desc: bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == adapter->num_rx_desc) i = 0; /* ** Send to the stack or LRO */ if (sendmp != NULL) { rxr->next_to_check = i; igb_rx_input(rxr, ifp, sendmp, ptype); i = rxr->next_to_check; rxdone++; } /* Every 8 descriptors we go to refresh mbufs */ if (processed == 8) { igb_refresh_mbufs(rxr, i); processed = 0; } } /* Catch any remainders */ if (igb_rx_unrefreshed(rxr)) igb_refresh_mbufs(rxr, i); rxr->next_to_check = i; /* * Flush any outstanding LRO work */ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } if (done != NULL) *done += rxdone; IGB_RX_UNLOCK(rxr); return ((staterr & E1000_RXD_STAT_DD) ? TRUE : FALSE); } /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of checksum so that stack * doesn't spend time verifying the checksum. * *********************************************************************/ static void igb_rx_checksum(u32 staterr, struct mbuf *mp, u32 ptype) { u16 status = (u16)staterr; u8 errors = (u8) (staterr >> 24); int sctp; /* Ignore Checksum bit is set */ if (status & E1000_RXD_STAT_IXSM) { mp->m_pkthdr.csum_flags = 0; return; } if ((ptype & E1000_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & E1000_RXDADV_PKTTYPE_SCTP) != 0) sctp = 1; else sctp = 0; if (status & E1000_RXD_STAT_IPCS) { /* Did it pass? */ if (!(errors & E1000_RXD_ERR_IPE)) { /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; } else mp->m_pkthdr.csum_flags = 0; } if (status & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) { u64 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); #if __FreeBSD_version >= 800000 if (sctp) /* reassign */ type = CSUM_SCTP_VALID; #endif /* Did it pass? */ if (!(errors & E1000_RXD_ERR_TCPE)) { mp->m_pkthdr.csum_flags |= type; if (sctp == 0) mp->m_pkthdr.csum_data = htons(0xffff); } } return; } /* * This routine is run via an vlan * config EVENT */ static void igb_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u32 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IGB_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; /* Change hw filter setting */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) igb_setup_vlan_hw_support(adapter); IGB_CORE_UNLOCK(adapter); } /* * This routine is run via an vlan * unconfig EVENT */ static void igb_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u32 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IGB_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Change hw filter setting */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) igb_setup_vlan_hw_support(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_setup_vlan_hw_support(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 reg; if (adapter->vf_ifp) { e1000_rlpml_set_vf(hw, adapter->max_frame_size + VLAN_TAG_SIZE); return; } reg = E1000_READ_REG(hw, E1000_CTRL); reg |= E1000_CTRL_VME; E1000_WRITE_REG(hw, E1000_CTRL, reg); /* Enable the Filter Table */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { reg = E1000_READ_REG(hw, E1000_RCTL); reg &= ~E1000_RCTL_CFIEN; reg |= E1000_RCTL_VFE; E1000_WRITE_REG(hw, E1000_RCTL, reg); } /* Update the frame size */ E1000_WRITE_REG(&adapter->hw, E1000_RLPML, adapter->max_frame_size + VLAN_TAG_SIZE); /* Don't bother with table if no vlans */ if ((adapter->num_vlans == 0) || ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0)) return; /* ** A soft reset zero's out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < IGB_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) { if (adapter->vf_ifp) e1000_vfta_set_vf(hw, adapter->shadow_vfta[i], TRUE); else e1000_write_vfta(hw, i, adapter->shadow_vfta[i]); } } static void igb_enable_intr(struct adapter *adapter) { /* With RSS set up what to auto clear */ if (adapter->msix_mem) { u32 mask = (adapter->que_mask | adapter->link_mask); E1000_WRITE_REG(&adapter->hw, E1000_EIAC, mask); E1000_WRITE_REG(&adapter->hw, E1000_EIAM, mask); E1000_WRITE_REG(&adapter->hw, E1000_EIMS, mask); E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_LSC); } else { E1000_WRITE_REG(&adapter->hw, E1000_IMS, IMS_ENABLE_MASK); } E1000_WRITE_FLUSH(&adapter->hw); return; } static void igb_disable_intr(struct adapter *adapter) { if (adapter->msix_mem) { E1000_WRITE_REG(&adapter->hw, E1000_EIMC, ~0); E1000_WRITE_REG(&adapter->hw, E1000_EIAC, 0); } E1000_WRITE_REG(&adapter->hw, E1000_IMC, ~0); E1000_WRITE_FLUSH(&adapter->hw); return; } /* * Bit of a misnomer, what this really means is * to enable OS management of the system... aka * to disable special hardware management features */ static void igb_init_manageability(struct adapter *adapter) { if (adapter->has_manage) { int manc2h = E1000_READ_REG(&adapter->hw, E1000_MANC2H); int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* disable hardware interception of ARP */ manc &= ~(E1000_MANC_ARP_EN); /* enable receiving management packets to the host */ manc |= E1000_MANC_EN_MNG2HOST; manc2h |= 1 << 5; /* Mng Port 623 */ manc2h |= 1 << 6; /* Mng Port 664 */ E1000_WRITE_REG(&adapter->hw, E1000_MANC2H, manc2h); E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * Give control back to hardware management * controller if there is one. */ static void igb_release_manageability(struct adapter *adapter) { if (adapter->has_manage) { int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* re-enable hardware interception of ARP */ manc |= E1000_MANC_ARP_EN; manc &= ~E1000_MANC_EN_MNG2HOST; E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * igb_get_hw_control sets CTRL_EXT:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means that * the driver is loaded. * */ static void igb_get_hw_control(struct adapter *adapter) { u32 ctrl_ext; if (adapter->vf_ifp) return; /* Let firmware know the driver has taken over */ ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext | E1000_CTRL_EXT_DRV_LOAD); } /* * igb_release_hw_control resets CTRL_EXT:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means that the * driver is no longer loaded. * */ static void igb_release_hw_control(struct adapter *adapter) { u32 ctrl_ext; if (adapter->vf_ifp) return; /* Let firmware taken over control of h/w */ ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext & ~E1000_CTRL_EXT_DRV_LOAD); } static int igb_is_valid_ether_addr(uint8_t *addr) { char zero_addr[6] = { 0, 0, 0, 0, 0, 0 }; if ((addr[0] & 1) || (!bcmp(addr, zero_addr, ETHER_ADDR_LEN))) { return (FALSE); } return (TRUE); } /* * Enable PCI Wake On Lan capability */ static void igb_enable_wakeup(device_t dev) { u16 cap, status; u8 id; /* First find the capabilities pointer*/ cap = pci_read_config(dev, PCIR_CAP_PTR, 2); /* Read the PM Capabilities */ id = pci_read_config(dev, cap, 1); if (id != PCIY_PMG) /* Something wrong */ return; /* OK, we have the power capabilities, so now get the status register */ cap += PCIR_POWER_STATUS; status = pci_read_config(dev, cap, 2); status |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE; pci_write_config(dev, cap, status, 2); return; } static void igb_led_func(void *arg, int onoff) { struct adapter *adapter = arg; IGB_CORE_LOCK(adapter); if (onoff) { e1000_setup_led(&adapter->hw); e1000_led_on(&adapter->hw); } else { e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } IGB_CORE_UNLOCK(adapter); } static uint64_t igb_get_counter(if_t ifp, ift_counter cnt) { struct adapter *adapter; struct e1000_hw_stats *stats; adapter = if_getsoftc(ifp); stats = (struct e1000_hw_stats *)adapter->stats; switch (cnt) { case IFCOUNTER_IERRORS: return (adapter->dropped_pkts + stats->rxerrc + stats->crcerrs + stats->algnerrc + stats->ruc + stats->roc + stats->mpc + stats->cexterr); case IFCOUNTER_OERRORS: return (stats->ecol + stats->latecol + adapter->watchdog_events); case IFCOUNTER_COLLISIONS: return (stats->colc); default: return (if_get_counter_default(ifp, cnt)); } } /********************************************************************** * * Update the board statistics counters. * **********************************************************************/ static void igb_update_stats_counters(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_hw_stats *stats; /* ** The virtual function adapter has only a ** small controlled set of stats, do only ** those and return. */ if (adapter->vf_ifp) { igb_update_vf_stats_counters(adapter); return; } stats = (struct e1000_hw_stats *)adapter->stats; if(adapter->hw.phy.media_type == e1000_media_type_copper || (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU)) { stats->symerrs += E1000_READ_REG(hw,E1000_SYMERRS); stats->sec += E1000_READ_REG(hw, E1000_SEC); } stats->crcerrs += E1000_READ_REG(hw, E1000_CRCERRS); stats->mpc += E1000_READ_REG(hw, E1000_MPC); stats->scc += E1000_READ_REG(hw, E1000_SCC); stats->ecol += E1000_READ_REG(hw, E1000_ECOL); stats->mcc += E1000_READ_REG(hw, E1000_MCC); stats->latecol += E1000_READ_REG(hw, E1000_LATECOL); stats->colc += E1000_READ_REG(hw, E1000_COLC); stats->dc += E1000_READ_REG(hw, E1000_DC); stats->rlec += E1000_READ_REG(hw, E1000_RLEC); stats->xonrxc += E1000_READ_REG(hw, E1000_XONRXC); stats->xontxc += E1000_READ_REG(hw, E1000_XONTXC); /* ** For watchdog management we need to know if we have been ** paused during the last interval, so capture that here. */ adapter->pause_frames = E1000_READ_REG(&adapter->hw, E1000_XOFFRXC); stats->xoffrxc += adapter->pause_frames; stats->xofftxc += E1000_READ_REG(hw, E1000_XOFFTXC); stats->fcruc += E1000_READ_REG(hw, E1000_FCRUC); stats->prc64 += E1000_READ_REG(hw, E1000_PRC64); stats->prc127 += E1000_READ_REG(hw, E1000_PRC127); stats->prc255 += E1000_READ_REG(hw, E1000_PRC255); stats->prc511 += E1000_READ_REG(hw, E1000_PRC511); stats->prc1023 += E1000_READ_REG(hw, E1000_PRC1023); stats->prc1522 += E1000_READ_REG(hw, E1000_PRC1522); stats->gprc += E1000_READ_REG(hw, E1000_GPRC); stats->bprc += E1000_READ_REG(hw, E1000_BPRC); stats->mprc += E1000_READ_REG(hw, E1000_MPRC); stats->gptc += E1000_READ_REG(hw, E1000_GPTC); /* For the 64-bit byte counters the low dword must be read first. */ /* Both registers clear on the read of the high dword */ stats->gorc += E1000_READ_REG(hw, E1000_GORCL) + ((u64)E1000_READ_REG(hw, E1000_GORCH) << 32); stats->gotc += E1000_READ_REG(hw, E1000_GOTCL) + ((u64)E1000_READ_REG(hw, E1000_GOTCH) << 32); stats->rnbc += E1000_READ_REG(hw, E1000_RNBC); stats->ruc += E1000_READ_REG(hw, E1000_RUC); stats->rfc += E1000_READ_REG(hw, E1000_RFC); stats->roc += E1000_READ_REG(hw, E1000_ROC); stats->rjc += E1000_READ_REG(hw, E1000_RJC); stats->mgprc += E1000_READ_REG(hw, E1000_MGTPRC); stats->mgpdc += E1000_READ_REG(hw, E1000_MGTPDC); stats->mgptc += E1000_READ_REG(hw, E1000_MGTPTC); stats->tor += E1000_READ_REG(hw, E1000_TORL) + ((u64)E1000_READ_REG(hw, E1000_TORH) << 32); stats->tot += E1000_READ_REG(hw, E1000_TOTL) + ((u64)E1000_READ_REG(hw, E1000_TOTH) << 32); stats->tpr += E1000_READ_REG(hw, E1000_TPR); stats->tpt += E1000_READ_REG(hw, E1000_TPT); stats->ptc64 += E1000_READ_REG(hw, E1000_PTC64); stats->ptc127 += E1000_READ_REG(hw, E1000_PTC127); stats->ptc255 += E1000_READ_REG(hw, E1000_PTC255); stats->ptc511 += E1000_READ_REG(hw, E1000_PTC511); stats->ptc1023 += E1000_READ_REG(hw, E1000_PTC1023); stats->ptc1522 += E1000_READ_REG(hw, E1000_PTC1522); stats->mptc += E1000_READ_REG(hw, E1000_MPTC); stats->bptc += E1000_READ_REG(hw, E1000_BPTC); /* Interrupt Counts */ stats->iac += E1000_READ_REG(hw, E1000_IAC); stats->icrxptc += E1000_READ_REG(hw, E1000_ICRXPTC); stats->icrxatc += E1000_READ_REG(hw, E1000_ICRXATC); stats->ictxptc += E1000_READ_REG(hw, E1000_ICTXPTC); stats->ictxatc += E1000_READ_REG(hw, E1000_ICTXATC); stats->ictxqec += E1000_READ_REG(hw, E1000_ICTXQEC); stats->ictxqmtc += E1000_READ_REG(hw, E1000_ICTXQMTC); stats->icrxdmtc += E1000_READ_REG(hw, E1000_ICRXDMTC); stats->icrxoc += E1000_READ_REG(hw, E1000_ICRXOC); /* Host to Card Statistics */ stats->cbtmpc += E1000_READ_REG(hw, E1000_CBTMPC); stats->htdpmc += E1000_READ_REG(hw, E1000_HTDPMC); stats->cbrdpc += E1000_READ_REG(hw, E1000_CBRDPC); stats->cbrmpc += E1000_READ_REG(hw, E1000_CBRMPC); stats->rpthc += E1000_READ_REG(hw, E1000_RPTHC); stats->hgptc += E1000_READ_REG(hw, E1000_HGPTC); stats->htcbdpc += E1000_READ_REG(hw, E1000_HTCBDPC); stats->hgorc += (E1000_READ_REG(hw, E1000_HGORCL) + ((u64)E1000_READ_REG(hw, E1000_HGORCH) << 32)); stats->hgotc += (E1000_READ_REG(hw, E1000_HGOTCL) + ((u64)E1000_READ_REG(hw, E1000_HGOTCH) << 32)); stats->lenerrs += E1000_READ_REG(hw, E1000_LENERRS); stats->scvpc += E1000_READ_REG(hw, E1000_SCVPC); stats->hrmpc += E1000_READ_REG(hw, E1000_HRMPC); stats->algnerrc += E1000_READ_REG(hw, E1000_ALGNERRC); stats->rxerrc += E1000_READ_REG(hw, E1000_RXERRC); stats->tncrs += E1000_READ_REG(hw, E1000_TNCRS); stats->cexterr += E1000_READ_REG(hw, E1000_CEXTERR); stats->tsctc += E1000_READ_REG(hw, E1000_TSCTC); stats->tsctfc += E1000_READ_REG(hw, E1000_TSCTFC); /* Driver specific counters */ adapter->device_control = E1000_READ_REG(hw, E1000_CTRL); adapter->rx_control = E1000_READ_REG(hw, E1000_RCTL); adapter->int_mask = E1000_READ_REG(hw, E1000_IMS); adapter->eint_mask = E1000_READ_REG(hw, E1000_EIMS); adapter->packet_buf_alloc_tx = ((E1000_READ_REG(hw, E1000_PBA) & 0xffff0000) >> 16); adapter->packet_buf_alloc_rx = (E1000_READ_REG(hw, E1000_PBA) & 0xffff); } /********************************************************************** * * Initialize the VF board statistics counters. * **********************************************************************/ static void igb_vf_init_stats(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_vf_stats *stats; stats = (struct e1000_vf_stats *)adapter->stats; if (stats == NULL) return; stats->last_gprc = E1000_READ_REG(hw, E1000_VFGPRC); stats->last_gorc = E1000_READ_REG(hw, E1000_VFGORC); stats->last_gptc = E1000_READ_REG(hw, E1000_VFGPTC); stats->last_gotc = E1000_READ_REG(hw, E1000_VFGOTC); stats->last_mprc = E1000_READ_REG(hw, E1000_VFMPRC); } /********************************************************************** * * Update the VF board statistics counters. * **********************************************************************/ static void igb_update_vf_stats_counters(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_vf_stats *stats; if (adapter->link_speed == 0) return; stats = (struct e1000_vf_stats *)adapter->stats; UPDATE_VF_REG(E1000_VFGPRC, stats->last_gprc, stats->gprc); UPDATE_VF_REG(E1000_VFGORC, stats->last_gorc, stats->gorc); UPDATE_VF_REG(E1000_VFGPTC, stats->last_gptc, stats->gptc); UPDATE_VF_REG(E1000_VFGOTC, stats->last_gotc, stats->gotc); UPDATE_VF_REG(E1000_VFMPRC, stats->last_mprc, stats->mprc); } /* Export a single 32-bit register via a read-only sysctl. */ static int igb_sysctl_reg_handler(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; u_int val; adapter = oidp->oid_arg1; val = E1000_READ_REG(&adapter->hw, oidp->oid_arg2); return (sysctl_handle_int(oidp, &val, 0, req)); } /* ** Tuneable interrupt rate handler */ static int igb_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) { struct igb_queue *que = ((struct igb_queue *)oidp->oid_arg1); int error; u32 reg, usec, rate; reg = E1000_READ_REG(&que->adapter->hw, E1000_EITR(que->msix)); usec = ((reg & 0x7FFC) >> 2); if (usec > 0) rate = 1000000 / usec; else rate = 0; error = sysctl_handle_int(oidp, &rate, 0, req); if (error || !req->newptr) return error; return 0; } /* * Add sysctl variables, one per statistic, to the system. */ static void igb_add_hw_stats(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct e1000_hw_stats *stats = adapter->stats; struct sysctl_oid *stat_node, *queue_node, *int_node, *host_node; struct sysctl_oid_list *stat_list, *queue_list, *int_list, *host_list; #define QUEUE_NAME_LEN 32 char namebuf[QUEUE_NAME_LEN]; /* Driver Statistics */ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "link_irq", CTLFLAG_RD, &adapter->link_irq, "Link MSIX IRQ Handled"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_dma_fail", CTLFLAG_RD, &adapter->no_tx_dma_setup, "Driver tx dma failure in xmit"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_overruns", CTLFLAG_RD, &adapter->rx_overruns, "RX overruns"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_timeouts", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "device_control", CTLFLAG_RD, &adapter->device_control, "Device Control Register"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_control", CTLFLAG_RD, &adapter->rx_control, "Receiver Control Register"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "interrupt_mask", CTLFLAG_RD, &adapter->int_mask, "Interrupt Mask"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "extended_int_mask", CTLFLAG_RD, &adapter->eint_mask, "Extended Interrupt Mask"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_buf_alloc", CTLFLAG_RD, &adapter->packet_buf_alloc_tx, "Transmit Buffer Packet Allocation"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_buf_alloc", CTLFLAG_RD, &adapter->packet_buf_alloc_rx, "Receive Buffer Packet Allocation"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_high_water", CTLFLAG_RD, &adapter->hw.fc.high_water, 0, "Flow Control High Watermark"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_low_water", CTLFLAG_RD, &adapter->hw.fc.low_water, 0, "Flow Control Low Watermark"); for (int i = 0; i < adapter->num_queues; i++, rxr++, txr++) { struct lro_ctrl *lro = &rxr->lro; snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate", CTLTYPE_UINT | CTLFLAG_RD, &adapter->queues[i], sizeof(&adapter->queues[i]), igb_sysctl_interrupt_rate_handler, "IU", "Interrupt Rate"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_TDH(txr->me), igb_sysctl_reg_handler, "IU", "Transmit Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_tail", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_TDT(txr->me), igb_sysctl_reg_handler, "IU", "Transmit Descriptor Tail"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue Descriptors Unavailable"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &txr->total_packets, "Queue Packets Transmitted"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDH(rxr->me), igb_sysctl_reg_handler, "IU", "Receive Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_tail", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDT(rxr->me), igb_sysctl_reg_handler, "IU", "Receive Descriptor Tail"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &rxr->rx_packets, "Queue Packets Received"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &rxr->rx_bytes, "Queue Bytes Received"); SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "lro_queued", CTLFLAG_RD, &lro->lro_queued, 0, "LRO Queued"); SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "lro_flushed", CTLFLAG_RD, &lro->lro_flushed, 0, "LRO Flushed"); } /* MAC stats get their own sub node */ stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC Statistics"); stat_list = SYSCTL_CHILDREN(stat_node); /* ** VF adapter has a very limited set of stats ** since its not managing the metal, so to speak. */ if (adapter->vf_ifp) { SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_recvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_recvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); return; } SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "excess_coll", CTLFLAG_RD, &stats->ecol, "Excessive collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "single_coll", CTLFLAG_RD, &stats->scc, "Single collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "multiple_coll", CTLFLAG_RD, &stats->mcc, "Multiple collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "late_coll", CTLFLAG_RD, &stats->latecol, "Late collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "collision_count", CTLFLAG_RD, &stats->colc, "Collision Count"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "symbol_errors", CTLFLAG_RD, &stats->symerrs, "Symbol Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "sequence_errors", CTLFLAG_RD, &stats->sec, "Sequence Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "defer_count", CTLFLAG_RD, &stats->dc, "Defer Count"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "missed_packets", CTLFLAG_RD, &stats->mpc, "Missed Packets"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_length_errors", CTLFLAG_RD, &stats->rlec, "Receive Length Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_no_buff", CTLFLAG_RD, &stats->rnbc, "Receive No Buffers"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_undersize", CTLFLAG_RD, &stats->ruc, "Receive Undersize"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_fragmented", CTLFLAG_RD, &stats->rfc, "Fragmented Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_oversize", CTLFLAG_RD, &stats->roc, "Oversized Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_jabber", CTLFLAG_RD, &stats->rjc, "Recevied Jabber"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_errs", CTLFLAG_RD, &stats->rxerrc, "Receive Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "crc_errs", CTLFLAG_RD, &stats->crcerrs, "CRC errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "alignment_errs", CTLFLAG_RD, &stats->algnerrc, "Alignment Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_no_crs", CTLFLAG_RD, &stats->tncrs, "Transmit with No CRS"); /* On 82575 these are collision counts */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "coll_ext_errs", CTLFLAG_RD, &stats->cexterr, "Collision/Carrier extension errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xon_recvd", CTLFLAG_RD, &stats->xonrxc, "XON Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xon_txd", CTLFLAG_RD, &stats->xontxc, "XON Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xoff_recvd", CTLFLAG_RD, &stats->xoffrxc, "XOFF Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xoff_txd", CTLFLAG_RD, &stats->xofftxc, "XOFF Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "unsupported_fc_recvd", CTLFLAG_RD, &stats->fcruc, "Unsupported Flow Control Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mgmt_pkts_recvd", CTLFLAG_RD, &stats->mgprc, "Management Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mgmt_pkts_drop", CTLFLAG_RD, &stats->mgpdc, "Management Packets Dropped"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mgmt_pkts_txd", CTLFLAG_RD, &stats->mgptc, "Management Packets Transmitted"); /* Packet Reception Stats */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_pkts_recvd", CTLFLAG_RD, &stats->tpr, "Total Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_recvd", CTLFLAG_RD, &stats->bprc, "Broadcast Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_recvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_64", CTLFLAG_RD, &stats->prc64, "64 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_65_127", CTLFLAG_RD, &stats->prc127, "65-127 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_128_255", CTLFLAG_RD, &stats->prc255, "128-255 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_256_511", CTLFLAG_RD, &stats->prc511, "256-511 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_512_1023", CTLFLAG_RD, &stats->prc1023, "512-1023 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_1024_1522", CTLFLAG_RD, &stats->prc1522, "1023-1522 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_recvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_octets_recvd", CTLFLAG_RD, &stats->tor, "Total Octets Received"); /* Packet Transmission Stats */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_octets_txd", CTLFLAG_RD, &stats->tot, "Total Octets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_pkts_txd", CTLFLAG_RD, &stats->tpt, "Total Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_txd", CTLFLAG_RD, &stats->bptc, "Broadcast Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_txd", CTLFLAG_RD, &stats->mptc, "Multicast Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_64", CTLFLAG_RD, &stats->ptc64, "64 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_65_127", CTLFLAG_RD, &stats->ptc127, "65-127 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_128_255", CTLFLAG_RD, &stats->ptc255, "128-255 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_256_511", CTLFLAG_RD, &stats->ptc511, "256-511 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_512_1023", CTLFLAG_RD, &stats->ptc1023, "512-1023 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_1024_1522", CTLFLAG_RD, &stats->ptc1522, "1024-1522 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tso_txd", CTLFLAG_RD, &stats->tsctc, "TSO Contexts Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tso_ctx_fail", CTLFLAG_RD, &stats->tsctfc, "TSO Contexts Failed"); /* Interrupt Stats */ int_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "interrupts", CTLFLAG_RD, NULL, "Interrupt Statistics"); int_list = SYSCTL_CHILDREN(int_node); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "asserts", CTLFLAG_RD, &stats->iac, "Interrupt Assertion Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_pkt_timer", CTLFLAG_RD, &stats->icrxptc, "Interrupt Cause Rx Pkt Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_abs_timer", CTLFLAG_RD, &stats->icrxatc, "Interrupt Cause Rx Abs Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_pkt_timer", CTLFLAG_RD, &stats->ictxptc, "Interrupt Cause Tx Pkt Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_abs_timer", CTLFLAG_RD, &stats->ictxatc, "Interrupt Cause Tx Abs Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_queue_empty", CTLFLAG_RD, &stats->ictxqec, "Interrupt Cause Tx Queue Empty Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_queue_min_thresh", CTLFLAG_RD, &stats->ictxqmtc, "Interrupt Cause Tx Queue Min Thresh Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_desc_min_thresh", CTLFLAG_RD, &stats->icrxdmtc, "Interrupt Cause Rx Desc Min Thresh Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_overrun", CTLFLAG_RD, &stats->icrxoc, "Interrupt Cause Receiver Overrun Count"); /* Host to Card Stats */ host_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "host", CTLFLAG_RD, NULL, "Host to Card Statistics"); host_list = SYSCTL_CHILDREN(host_node); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_tx_pkt", CTLFLAG_RD, &stats->cbtmpc, "Circuit Breaker Tx Packet Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "host_tx_pkt_discard", CTLFLAG_RD, &stats->htdpmc, "Host Transmit Discarded Packets"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "rx_pkt", CTLFLAG_RD, &stats->rpthc, "Rx Packets To Host"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_rx_pkts", CTLFLAG_RD, &stats->cbrmpc, "Circuit Breaker Rx Packet Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_rx_pkt_drop", CTLFLAG_RD, &stats->cbrdpc, "Circuit Breaker Rx Dropped Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "tx_good_pkt", CTLFLAG_RD, &stats->hgptc, "Host Good Packets Tx Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_tx_pkt_drop", CTLFLAG_RD, &stats->htcbdpc, "Host Tx Circuit Breaker Dropped Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "rx_good_bytes", CTLFLAG_RD, &stats->hgorc, "Host Good Octets Received Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "tx_good_bytes", CTLFLAG_RD, &stats->hgotc, "Host Good Octets Transmit Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "length_errors", CTLFLAG_RD, &stats->lenerrs, "Length Errors"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "serdes_violation_pkt", CTLFLAG_RD, &stats->scvpc, "SerDes/SGMII Code Violation Pkt Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "header_redir_missed", CTLFLAG_RD, &stats->hrmpc, "Header Redirection Missed Packet Count"); } /********************************************************************** * * This routine provides a way to dump out the adapter eeprom, * often a useful debug/service tool. This only dumps the first * 32 words, stuff that matters is in that extent. * **********************************************************************/ static int igb_sysctl_nvm_info(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; int error; int result; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); /* * This value will cause a hex dump of the * first 32 16-bit words of the EEPROM to * the screen. */ if (result == 1) { adapter = (struct adapter *)arg1; igb_print_nvm_info(adapter); } return (error); } static void igb_print_nvm_info(struct adapter *adapter) { u16 eeprom_data; int i, j, row = 0; /* Its a bit crude, but it gets the job done */ printf("\nInterface EEPROM Dump:\n"); printf("Offset\n0x0000 "); for (i = 0, j = 0; i < 32; i++, j++) { if (j == 8) { /* Make the offset block */ j = 0; ++row; printf("\n0x00%x0 ",row); } e1000_read_nvm(&adapter->hw, i, 1, &eeprom_data); printf("%04x ", eeprom_data); } printf("\n"); } static void igb_set_sysctl_value(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLFLAG_RW, limit, value, description); } /* ** Set flow control using sysctl: ** Flow control values: ** 0 - off ** 1 - rx pause ** 2 - tx pause ** 3 - full */ static int igb_set_flowcntl(SYSCTL_HANDLER_ARGS) { int error; static int input = 3; /* default is full */ struct adapter *adapter = (struct adapter *) arg1; error = sysctl_handle_int(oidp, &input, 0, req); if ((error) || (req->newptr == NULL)) return (error); switch (input) { case e1000_fc_rx_pause: case e1000_fc_tx_pause: case e1000_fc_full: case e1000_fc_none: adapter->hw.fc.requested_mode = input; adapter->fc = input; break; default: /* Do nothing */ return (error); } adapter->hw.fc.current_mode = adapter->hw.fc.requested_mode; e1000_force_mac_fc(&adapter->hw); /* XXX TODO: update DROP_EN on each RX queue if appropriate */ return (error); } /* ** Manage DMA Coalesce: ** Control values: ** 0/1 - off/on ** Legal timer values are: ** 250,500,1000-10000 in thousands */ static int igb_sysctl_dmac(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; int error; error = sysctl_handle_int(oidp, &adapter->dmac, 0, req); if ((error) || (req->newptr == NULL)) return (error); switch (adapter->dmac) { case 0: /*Disabling */ break; case 1: /* Just enable and use default */ adapter->dmac = 1000; break; case 250: case 500: case 1000: case 2000: case 3000: case 4000: case 5000: case 6000: case 7000: case 8000: case 9000: case 10000: /* Legal values - allow */ break; default: /* Do nothing, illegal value */ adapter->dmac = 0; return (EINVAL); } /* Reinit the interface */ igb_init(adapter); return (error); } /* ** Manage Energy Efficient Ethernet: ** Control values: ** 0/1 - enabled/disabled */ static int igb_sysctl_eee(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; int error, value; value = adapter->hw.dev_spec._82575.eee_disable; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); IGB_CORE_LOCK(adapter); adapter->hw.dev_spec._82575.eee_disable = (value != 0); igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); return (0); } static int igb_per_unit_num_queues(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; adapter = (struct adapter *) arg1; return sysctl_handle_int(oidp, &adapter->num_queues, 0, req); } Index: head/sys/dev/ixgbe/ixgbe.c =================================================================== --- head/sys/dev/ixgbe/ixgbe.c (revision 275357) +++ head/sys/dev/ixgbe/ixgbe.c (revision 275358) @@ -1,6089 +1,6089 @@ /****************************************************************************** Copyright (c) 2001-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include "ixgbe.h" #ifdef RSS #include #endif /********************************************************************* * Set this to one to display debug statistics *********************************************************************/ int ixgbe_display_debug_stats = 0; /********************************************************************* * Driver version *********************************************************************/ char ixgbe_driver_version[] = "2.5.15"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into ixgbe_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static ixgbe_vendor_info_t ixgbe_vendor_info_array[] = { {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AF_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AF_SINGLE_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_CX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AT2, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_DA_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_CX4_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_XF_LR, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_SFP_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_KX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_KX4_MEZZ, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_XAUI_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_CX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_T3_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_COMBO_BACKPLANE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_BACKPLANE_FCOE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_SF2, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_FCOE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599EN_SFP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_SF_QP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X540T, 0, 0, 0}, /* required last entry */ {0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings *********************************************************************/ static char *ixgbe_strings[] = { "Intel(R) PRO/10GbE PCI-Express Network Driver" }; /********************************************************************* * Function prototypes *********************************************************************/ static int ixgbe_per_unit_num_queues(SYSCTL_HANDLER_ARGS); static int ixgbe_probe(device_t); static int ixgbe_attach(device_t); static int ixgbe_detach(device_t); static int ixgbe_shutdown(device_t); #ifdef IXGBE_LEGACY_TX static void ixgbe_start(struct ifnet *); static void ixgbe_start_locked(struct tx_ring *, struct ifnet *); #else /* ! IXGBE_LEGACY_TX */ static int ixgbe_mq_start(struct ifnet *, struct mbuf *); static int ixgbe_mq_start_locked(struct ifnet *, struct tx_ring *); static void ixgbe_qflush(struct ifnet *); static void ixgbe_deferred_mq_start(void *, int); #endif /* IXGBE_LEGACY_TX */ static int ixgbe_ioctl(struct ifnet *, u_long, caddr_t); static void ixgbe_init(void *); static void ixgbe_init_locked(struct adapter *); static void ixgbe_stop(void *); static uint64_t ixgbe_get_counter(struct ifnet *, ift_counter); static void ixgbe_media_status(struct ifnet *, struct ifmediareq *); static int ixgbe_media_change(struct ifnet *); static void ixgbe_identify_hardware(struct adapter *); static int ixgbe_allocate_pci_resources(struct adapter *); static void ixgbe_get_slot_info(struct ixgbe_hw *); static int ixgbe_allocate_msix(struct adapter *); static int ixgbe_allocate_legacy(struct adapter *); static int ixgbe_allocate_queues(struct adapter *); static int ixgbe_setup_msix(struct adapter *); static void ixgbe_free_pci_resources(struct adapter *); static void ixgbe_local_timer(void *); static int ixgbe_setup_interface(device_t, struct adapter *); static void ixgbe_config_link(struct adapter *); static int ixgbe_allocate_transmit_buffers(struct tx_ring *); static int ixgbe_setup_transmit_structures(struct adapter *); static void ixgbe_setup_transmit_ring(struct tx_ring *); static void ixgbe_initialize_transmit_units(struct adapter *); static void ixgbe_free_transmit_structures(struct adapter *); static void ixgbe_free_transmit_buffers(struct tx_ring *); static int ixgbe_allocate_receive_buffers(struct rx_ring *); static int ixgbe_setup_receive_structures(struct adapter *); static int ixgbe_setup_receive_ring(struct rx_ring *); static void ixgbe_initialize_receive_units(struct adapter *); static void ixgbe_free_receive_structures(struct adapter *); static void ixgbe_free_receive_buffers(struct rx_ring *); static void ixgbe_setup_hw_rsc(struct rx_ring *); static void ixgbe_enable_intr(struct adapter *); static void ixgbe_disable_intr(struct adapter *); static void ixgbe_update_stats_counters(struct adapter *); static void ixgbe_txeof(struct tx_ring *); static bool ixgbe_rxeof(struct ix_queue *); static void ixgbe_rx_checksum(u32, struct mbuf *, u32); static void ixgbe_set_promisc(struct adapter *); static void ixgbe_set_multi(struct adapter *); static void ixgbe_update_link_status(struct adapter *); static void ixgbe_refresh_mbufs(struct rx_ring *, int); static int ixgbe_xmit(struct tx_ring *, struct mbuf **); static int ixgbe_set_flowcntl(SYSCTL_HANDLER_ARGS); static int ixgbe_set_advertise(SYSCTL_HANDLER_ARGS); static int ixgbe_set_thermal_test(SYSCTL_HANDLER_ARGS); static int ixgbe_dma_malloc(struct adapter *, bus_size_t, struct ixgbe_dma_alloc *, int); static void ixgbe_dma_free(struct adapter *, struct ixgbe_dma_alloc *); static int ixgbe_tx_ctx_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); static int ixgbe_tso_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); static void ixgbe_set_ivar(struct adapter *, u8, u8, s8); static void ixgbe_configure_ivars(struct adapter *); static u8 * ixgbe_mc_array_itr(struct ixgbe_hw *, u8 **, u32 *); static void ixgbe_setup_vlan_hw_support(struct adapter *); static void ixgbe_register_vlan(void *, struct ifnet *, u16); static void ixgbe_unregister_vlan(void *, struct ifnet *, u16); static void ixgbe_add_hw_stats(struct adapter *adapter); static __inline void ixgbe_rx_discard(struct rx_ring *, int); static __inline void ixgbe_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u32); static void ixgbe_enable_rx_drop(struct adapter *); static void ixgbe_disable_rx_drop(struct adapter *); /* Support for pluggable optic modules */ static bool ixgbe_sfp_probe(struct adapter *); static void ixgbe_setup_optics(struct adapter *); /* Legacy (single vector interrupt handler */ static void ixgbe_legacy_irq(void *); /* The MSI/X Interrupt handlers */ static void ixgbe_msix_que(void *); static void ixgbe_msix_link(void *); /* Deferred interrupt tasklets */ static void ixgbe_handle_que(void *, int); static void ixgbe_handle_link(void *, int); static void ixgbe_handle_msf(void *, int); static void ixgbe_handle_mod(void *, int); #ifdef IXGBE_FDIR static void ixgbe_atr(struct tx_ring *, struct mbuf *); static void ixgbe_reinit_fdir(void *, int); #endif /* Missing shared code prototype */ extern void ixgbe_stop_mac_link_on_d3_82599(struct ixgbe_hw *hw); /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t ixgbe_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ixgbe_probe), DEVMETHOD(device_attach, ixgbe_attach), DEVMETHOD(device_detach, ixgbe_detach), DEVMETHOD(device_shutdown, ixgbe_shutdown), DEVMETHOD_END }; static driver_t ixgbe_driver = { "ix", ixgbe_methods, sizeof(struct adapter), }; devclass_t ixgbe_devclass; DRIVER_MODULE(ixgbe, pci, ixgbe_driver, ixgbe_devclass, 0, 0); MODULE_DEPEND(ixgbe, pci, 1, 1, 1); MODULE_DEPEND(ixgbe, ether, 1, 1, 1); /* ** TUNEABLE PARAMETERS: */ static SYSCTL_NODE(_hw, OID_AUTO, ix, CTLFLAG_RD, 0, "IXGBE driver parameters"); /* ** AIM: Adaptive Interrupt Moderation ** which means that the interrupt rate ** is varied over time based on the ** traffic for that interrupt vector */ static int ixgbe_enable_aim = TRUE; SYSCTL_INT(_hw_ix, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &ixgbe_enable_aim, 0, "Enable adaptive interrupt moderation"); static int ixgbe_max_interrupt_rate = (4000000 / IXGBE_LOW_LATENCY); SYSCTL_INT(_hw_ix, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN, &ixgbe_max_interrupt_rate, 0, "Maximum interrupts per second"); /* How many packets rxeof tries to clean at a time */ static int ixgbe_rx_process_limit = 256; SYSCTL_INT(_hw_ix, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, &ixgbe_rx_process_limit, 0, "Maximum number of received packets to process at a time," "-1 means unlimited"); /* How many packets txeof tries to clean at a time */ static int ixgbe_tx_process_limit = 256; SYSCTL_INT(_hw_ix, OID_AUTO, tx_process_limit, CTLFLAG_RDTUN, &ixgbe_tx_process_limit, 0, "Maximum number of sent packets to process at a time," "-1 means unlimited"); /* ** Smart speed setting, default to on ** this only works as a compile option ** right now as its during attach, set ** this to 'ixgbe_smart_speed_off' to ** disable. */ static int ixgbe_smart_speed = ixgbe_smart_speed_on; /* * MSIX should be the default for best performance, * but this allows it to be forced off for testing. */ static int ixgbe_enable_msix = 1; SYSCTL_INT(_hw_ix, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &ixgbe_enable_msix, 0, "Enable MSI-X interrupts"); /* * Number of Queues, can be set to 0, * it then autoconfigures based on the * number of cpus with a max of 8. This * can be overriden manually here. */ static int ixgbe_num_queues = 0; SYSCTL_INT(_hw_ix, OID_AUTO, num_queues, CTLFLAG_RDTUN, &ixgbe_num_queues, 0, "Number of queues to configure, 0 indicates autoconfigure"); /* ** Number of TX descriptors per ring, ** setting higher than RX as this seems ** the better performing choice. */ static int ixgbe_txd = PERFORM_TXD; SYSCTL_INT(_hw_ix, OID_AUTO, txd, CTLFLAG_RDTUN, &ixgbe_txd, 0, "Number of transmit descriptors per queue"); /* Number of RX descriptors per ring */ static int ixgbe_rxd = PERFORM_RXD; SYSCTL_INT(_hw_ix, OID_AUTO, rxd, CTLFLAG_RDTUN, &ixgbe_rxd, 0, "Number of receive descriptors per queue"); /* ** Defining this on will allow the use ** of unsupported SFP+ modules, note that ** doing so you are on your own :) */ static int allow_unsupported_sfp = FALSE; TUNABLE_INT("hw.ix.unsupported_sfp", &allow_unsupported_sfp); /* ** HW RSC control: ** this feature only works with ** IPv4, and only on 82599 and later. ** Also this will cause IP forwarding to ** fail and that can't be controlled by ** the stack as LRO can. For all these ** reasons I've deemed it best to leave ** this off and not bother with a tuneable ** interface, this would need to be compiled ** to enable. */ static bool ixgbe_rsc_enable = FALSE; /* Keep running tab on them for sanity check */ static int ixgbe_total_ports; #ifdef IXGBE_FDIR /* ** For Flow Director: this is the ** number of TX packets we sample ** for the filter pool, this means ** every 20th packet will be probed. ** ** This feature can be disabled by ** setting this to 0. */ static int atr_sample_rate = 20; /* ** Flow Director actually 'steals' ** part of the packet buffer as its ** filter pool, this variable controls ** how much it uses: ** 0 = 64K, 1 = 128K, 2 = 256K */ static int fdir_pballoc = 1; #endif #ifdef DEV_NETMAP /* * The #ifdef DEV_NETMAP / #endif blocks in this file are meant to * be a reference on how to implement netmap support in a driver. * Additional comments are in ixgbe_netmap.h . * * contains functions for netmap support * that extend the standard driver. */ #include #endif /* DEV_NETMAP */ /********************************************************************* * Device identification routine * * ixgbe_probe determines if the driver should be loaded on * adapter based on PCI vendor/device id of the adapter. * * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int ixgbe_probe(device_t dev) { ixgbe_vendor_info_t *ent; u16 pci_vendor_id = 0; u16 pci_device_id = 0; u16 pci_subvendor_id = 0; u16 pci_subdevice_id = 0; char adapter_name[256]; INIT_DEBUGOUT("ixgbe_probe: begin"); pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != IXGBE_INTEL_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = ixgbe_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == 0)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == 0))) { sprintf(adapter_name, "%s, Version - %s", ixgbe_strings[ent->index], ixgbe_driver_version); device_set_desc_copy(dev, adapter_name); ++ixgbe_total_ports; return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. * * return 0 on success, positive on failure *********************************************************************/ static int ixgbe_attach(device_t dev) { struct adapter *adapter; struct ixgbe_hw *hw; int error = 0; u16 csum; u32 ctrl_ext; INIT_DEBUGOUT("ixgbe_attach: begin"); /* Allocate, clear, and link in our adapter structure */ adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; hw = &adapter->hw; /* Core Lock Init*/ IXGBE_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTL APIs */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "fc", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_set_flowcntl, "I", "Flow Control"); SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "enable_aim", CTLFLAG_RW, &ixgbe_enable_aim, 1, "Interrupt Moderation"); /* ** Allow a kind of speed control by forcing the autoneg ** advertised speed list to only a certain value, this ** supports 1G on 82599 devices, and 100Mb on x540. */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "advertise_speed", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_set_advertise, "I", "Link Speed"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "num_queues", CTLTYPE_INT | CTLFLAG_RD, adapter, 0, ixgbe_per_unit_num_queues, "I", "Number of Queues"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ts", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_set_thermal_test, "I", "Thermal Test"); /* Set up the timer callout */ callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); /* Determine hardware revision */ ixgbe_identify_hardware(adapter); /* Do base PCI setup - map BAR0 */ if (ixgbe_allocate_pci_resources(adapter)) { device_printf(dev, "Allocation of PCI resources failed\n"); error = ENXIO; goto err_out; } /* Do descriptor calc and sanity checks */ if (((ixgbe_txd * sizeof(union ixgbe_adv_tx_desc)) % DBA_ALIGN) != 0 || ixgbe_txd < MIN_TXD || ixgbe_txd > MAX_TXD) { device_printf(dev, "TXD config issue, using default!\n"); adapter->num_tx_desc = DEFAULT_TXD; } else adapter->num_tx_desc = ixgbe_txd; /* ** With many RX rings it is easy to exceed the ** system mbuf allocation. Tuning nmbclusters ** can alleviate this. */ if (nmbclusters > 0 ) { int s; s = (ixgbe_rxd * adapter->num_queues) * ixgbe_total_ports; if (s > nmbclusters) { device_printf(dev, "RX Descriptors exceed " "system mbuf max, using default instead!\n"); ixgbe_rxd = DEFAULT_RXD; } } if (((ixgbe_rxd * sizeof(union ixgbe_adv_rx_desc)) % DBA_ALIGN) != 0 || ixgbe_rxd < MIN_RXD || ixgbe_rxd > MAX_RXD) { device_printf(dev, "RXD config issue, using default!\n"); adapter->num_rx_desc = DEFAULT_RXD; } else adapter->num_rx_desc = ixgbe_rxd; /* Allocate our TX/RX Queues */ if (ixgbe_allocate_queues(adapter)) { error = ENOMEM; goto err_out; } /* Allocate multicast array memory. */ adapter->mta = malloc(sizeof(u8) * IXGBE_ETH_LENGTH_OF_ADDRESS * MAX_NUM_MULTICAST_ADDRESSES, M_DEVBUF, M_NOWAIT); if (adapter->mta == NULL) { device_printf(dev, "Can not allocate multicast setup array\n"); error = ENOMEM; goto err_late; } /* Initialize the shared code */ hw->allow_unsupported_sfp = allow_unsupported_sfp; error = ixgbe_init_shared_code(hw); if (error == IXGBE_ERR_SFP_NOT_PRESENT) { /* ** No optics in this port, set up ** so the timer routine will probe ** for later insertion. */ adapter->sfp_probe = TRUE; error = 0; } else if (error == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev,"Unsupported SFP+ module detected!\n"); error = EIO; goto err_late; } else if (error) { device_printf(dev,"Unable to initialize the shared code\n"); error = EIO; goto err_late; } /* Make sure we have a good EEPROM before we read from it */ if (ixgbe_validate_eeprom_checksum(&adapter->hw, &csum) < 0) { device_printf(dev,"The EEPROM Checksum Is Not Valid\n"); error = EIO; goto err_late; } error = ixgbe_init_hw(hw); switch (error) { case IXGBE_ERR_EEPROM_VERSION: device_printf(dev, "This device is a pre-production adapter/" "LOM. Please be aware there may be issues associated " "with your hardware.\n If you are experiencing problems " "please contact your Intel or hardware representative " "who provided you with this hardware.\n"); break; case IXGBE_ERR_SFP_NOT_SUPPORTED: device_printf(dev,"Unsupported SFP+ Module\n"); error = EIO; goto err_late; case IXGBE_ERR_SFP_NOT_PRESENT: device_printf(dev,"No SFP+ Module found\n"); /* falls thru */ default: break; } /* Detect and set physical type */ ixgbe_setup_optics(adapter); if ((adapter->msix > 1) && (ixgbe_enable_msix)) error = ixgbe_allocate_msix(adapter); else error = ixgbe_allocate_legacy(adapter); if (error) goto err_late; /* Setup OS specific network interface */ if (ixgbe_setup_interface(dev, adapter) != 0) goto err_late; /* Initialize statistics */ ixgbe_update_stats_counters(adapter); /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ixgbe_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ixgbe_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); /* ** Check PCIE slot type/speed/width */ ixgbe_get_slot_info(hw); /* Set an initial default flow control value */ adapter->fc = ixgbe_fc_full; /* let hardware know driver is loaded */ ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT); ctrl_ext |= IXGBE_CTRL_EXT_DRV_LOAD; IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, ctrl_ext); ixgbe_add_hw_stats(adapter); #ifdef DEV_NETMAP ixgbe_netmap_attach(adapter); #endif /* DEV_NETMAP */ INIT_DEBUGOUT("ixgbe_attach: end"); return (0); err_late: ixgbe_free_transmit_structures(adapter); ixgbe_free_receive_structures(adapter); err_out: if (adapter->ifp != NULL) if_free(adapter->ifp); ixgbe_free_pci_resources(adapter); free(adapter->mta, M_DEVBUF); return (error); } /********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. * * return 0 on success, positive on failure *********************************************************************/ static int ixgbe_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; u32 ctrl_ext; INIT_DEBUGOUT("ixgbe_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } IXGBE_CORE_LOCK(adapter); ixgbe_stop(adapter); IXGBE_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { if (que->tq) { #ifndef IXGBE_LEGACY_TX taskqueue_drain(que->tq, &txr->txq_task); #endif taskqueue_drain(que->tq, &que->que_task); taskqueue_free(que->tq); } } /* Drain the Link queue */ if (adapter->tq) { taskqueue_drain(adapter->tq, &adapter->link_task); taskqueue_drain(adapter->tq, &adapter->mod_task); taskqueue_drain(adapter->tq, &adapter->msf_task); #ifdef IXGBE_FDIR taskqueue_drain(adapter->tq, &adapter->fdir_task); #endif taskqueue_free(adapter->tq); } /* let hardware know driver is unloading */ ctrl_ext = IXGBE_READ_REG(&adapter->hw, IXGBE_CTRL_EXT); ctrl_ext &= ~IXGBE_CTRL_EXT_DRV_LOAD; IXGBE_WRITE_REG(&adapter->hw, IXGBE_CTRL_EXT, ctrl_ext); /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); ether_ifdetach(adapter->ifp); callout_drain(&adapter->timer); #ifdef DEV_NETMAP netmap_detach(adapter->ifp); #endif /* DEV_NETMAP */ ixgbe_free_pci_resources(adapter); bus_generic_detach(dev); if_free(adapter->ifp); ixgbe_free_transmit_structures(adapter); ixgbe_free_receive_structures(adapter); free(adapter->mta, M_DEVBUF); IXGBE_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int ixgbe_shutdown(device_t dev) { struct adapter *adapter = device_get_softc(dev); IXGBE_CORE_LOCK(adapter); ixgbe_stop(adapter); IXGBE_CORE_UNLOCK(adapter); return (0); } #ifdef IXGBE_LEGACY_TX /********************************************************************* * Transmit entry point * * ixgbe_start is called by the stack to initiate a transmit. * The driver will remain in this routine as long as there are * packets to transmit and transmit resources are available. * In case resources are not available stack is notified and * the packet is requeued. **********************************************************************/ static void ixgbe_start_locked(struct tx_ring *txr, struct ifnet * ifp) { struct mbuf *m_head; struct adapter *adapter = txr->adapter; IXGBE_TX_LOCK_ASSERT(txr); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; if (!adapter->link_active) return; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if (txr->tx_avail <= IXGBE_QUEUE_MIN_FREE) break; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (ixgbe_xmit(txr, &m_head)) { if (m_head != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m_head); break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); /* Set watchdog on */ txr->watchdog_time = ticks; txr->queue_status = IXGBE_QUEUE_WORKING; } return; } /* * Legacy TX start - called by the stack, this * always uses the first tx ring, and should * not be used with multiqueue tx enabled. */ static void ixgbe_start(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_TX_LOCK(txr); ixgbe_start_locked(txr, ifp); IXGBE_TX_UNLOCK(txr); } return; } #else /* ! IXGBE_LEGACY_TX */ /* ** Multiqueue Transmit driver ** */ static int ixgbe_mq_start(struct ifnet *ifp, struct mbuf *m) { struct adapter *adapter = ifp->if_softc; struct ix_queue *que; struct tx_ring *txr; int i, err = 0; #ifdef RSS uint32_t bucket_id; #endif /* Which queue to use */ /* * When doing RSS, map it to the same outbound queue * as the incoming flow would be mapped to. * * If everything is setup correctly, it should be the * same bucket that the current CPU we're on is. */ - if ((m->m_flags & M_FLOWID) != 0) { + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { #ifdef RSS if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), &bucket_id) == 0) { /* XXX TODO: spit out something if bucket_id > num_queues? */ i = bucket_id % adapter->num_queues; } else { #endif i = m->m_pkthdr.flowid % adapter->num_queues; #ifdef RSS } #endif } else { i = curcpu % adapter->num_queues; } txr = &adapter->tx_rings[i]; que = &adapter->queues[i]; err = drbr_enqueue(ifp, txr->br, m); if (err) return (err); if (IXGBE_TX_TRYLOCK(txr)) { ixgbe_mq_start_locked(ifp, txr); IXGBE_TX_UNLOCK(txr); } else taskqueue_enqueue(que->tq, &txr->txq_task); return (0); } static int ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct mbuf *next; int enqueued = 0, err = 0; if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) || adapter->link_active == 0) return (ENETDOWN); /* Process the queue */ #if __FreeBSD_version < 901504 next = drbr_dequeue(ifp, txr->br); while (next != NULL) { if ((err = ixgbe_xmit(txr, &next)) != 0) { if (next != NULL) err = drbr_enqueue(ifp, txr->br, next); #else while ((next = drbr_peek(ifp, txr->br)) != NULL) { if ((err = ixgbe_xmit(txr, &next)) != 0) { if (next == NULL) { drbr_advance(ifp, txr->br); } else { drbr_putback(ifp, txr->br, next); } #endif break; } #if __FreeBSD_version >= 901504 drbr_advance(ifp, txr->br); #endif enqueued++; /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; #if __FreeBSD_version < 901504 next = drbr_dequeue(ifp, txr->br); #endif } if (enqueued > 0) { /* Set watchdog on */ txr->queue_status = IXGBE_QUEUE_WORKING; txr->watchdog_time = ticks; } if (txr->tx_avail < IXGBE_TX_CLEANUP_THRESHOLD) ixgbe_txeof(txr); return (err); } /* * Called from a taskqueue to drain queued transmit packets. */ static void ixgbe_deferred_mq_start(void *arg, int pending) { struct tx_ring *txr = arg; struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; IXGBE_TX_LOCK(txr); if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); IXGBE_TX_UNLOCK(txr); } /* ** Flush all ring buffers */ static void ixgbe_qflush(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; struct mbuf *m; for (int i = 0; i < adapter->num_queues; i++, txr++) { IXGBE_TX_LOCK(txr); while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) m_freem(m); IXGBE_TX_UNLOCK(txr); } if_qflush(ifp); } #endif /* IXGBE_LEGACY_TX */ /********************************************************************* * Ioctl entry point * * ixgbe_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int ixgbe_ioctl(struct ifnet * ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ixgbe_hw *hw = &adapter->hw; struct ifreq *ifr = (struct ifreq *) data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; bool avoid_reset = FALSE; #endif int error = 0; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif #if defined(INET) || defined(INET6) /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ixgbe_init(adapter); if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); } else error = ether_ioctl(ifp, command, data); #endif break; case SIOCSIFMTU: IOCTL_DEBUGOUT("ioctl: SIOCSIFMTU (Set Interface MTU)"); if (ifr->ifr_mtu > IXGBE_MAX_FRAME_SIZE - ETHER_HDR_LEN) { error = EINVAL; } else { IXGBE_CORE_LOCK(adapter); ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } break; case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl: SIOCSIFFLAGS (Set Interface Flags)"); IXGBE_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ adapter->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { ixgbe_set_promisc(adapter); } } else ixgbe_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ixgbe_stop(adapter); adapter->if_flags = ifp->if_flags; IXGBE_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_CORE_LOCK(adapter); ixgbe_disable_intr(adapter); ixgbe_set_multi(adapter); ixgbe_enable_intr(adapter); IXGBE_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl: SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { int mask = ifr->ifr_reqcap ^ ifp->if_capenable; IOCTL_DEBUGOUT("ioctl: SIOCSIFCAP (Set Capabilities)"); if (mask & IFCAP_HWCSUM) ifp->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_TSO6) ifp->if_capenable ^= IFCAP_TSO6; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWFILTER) ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_CORE_LOCK(adapter); ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } VLAN_CAPABILITIES(ifp); break; } case SIOCGI2C: { struct ifi2creq i2c; int i; IOCTL_DEBUGOUT("ioctl: SIOCGI2C (Get I2C Data)"); error = copyin(ifr->ifr_data, &i2c, sizeof(i2c)); if (error != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { error = EINVAL; break; } if (i2c.len > sizeof(i2c.data)) { error = EINVAL; break; } for (i = 0; i < i2c.len; i++) hw->phy.ops.read_i2c_byte(hw, i2c.offset + i, i2c.dev_addr, &i2c.data[i]); error = copyout(&i2c, ifr->ifr_data, sizeof(i2c)); break; } default: IOCTL_DEBUGOUT1("ioctl: UNKNOWN (0x%X)\n", (int)command); error = ether_ioctl(ifp, command, data); break; } return (error); } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ #define IXGBE_MHADD_MFS_SHIFT 16 static void ixgbe_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; u32 k, txdctl, mhadd, gpie; u32 rxdctl, rxctrl; mtx_assert(&adapter->core_mtx, MA_OWNED); INIT_DEBUGOUT("ixgbe_init_locked: begin"); hw->adapter_stopped = FALSE; ixgbe_stop_adapter(hw); callout_stop(&adapter->timer); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(hw, 0, adapter->hw.mac.addr, 0, IXGBE_RAH_AV); /* Get the latest mac address, User can use a LAA */ bcopy(IF_LLADDR(adapter->ifp), hw->mac.addr, IXGBE_ETH_LENGTH_OF_ADDRESS); ixgbe_set_rar(hw, 0, hw->mac.addr, 0, 1); hw->addr_ctrl.rar_used_count = 1; /* Set the various hardware offload abilities */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TSO) ifp->if_hwassist |= CSUM_TSO; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); #if __FreeBSD_version >= 800000 if (hw->mac.type != ixgbe_mac_82598EB) ifp->if_hwassist |= CSUM_SCTP; #endif } /* Prepare transmit descriptors and buffers */ if (ixgbe_setup_transmit_structures(adapter)) { device_printf(dev,"Could not setup transmit structures\n"); ixgbe_stop(adapter); return; } ixgbe_init_hw(hw); ixgbe_initialize_transmit_units(adapter); /* Setup Multicast table */ ixgbe_set_multi(adapter); /* ** Determine the correct mbuf pool ** for doing jumbo frames */ if (adapter->max_frame_size <= 2048) adapter->rx_mbuf_sz = MCLBYTES; else if (adapter->max_frame_size <= 4096) adapter->rx_mbuf_sz = MJUMPAGESIZE; else if (adapter->max_frame_size <= 9216) adapter->rx_mbuf_sz = MJUM9BYTES; else adapter->rx_mbuf_sz = MJUM16BYTES; /* Prepare receive descriptors and buffers */ if (ixgbe_setup_receive_structures(adapter)) { device_printf(dev,"Could not setup receive structures\n"); ixgbe_stop(adapter); return; } /* Configure RX settings */ ixgbe_initialize_receive_units(adapter); gpie = IXGBE_READ_REG(&adapter->hw, IXGBE_GPIE); /* Enable Fan Failure Interrupt */ gpie |= IXGBE_SDP1_GPIEN; /* Add for Module detection */ if (hw->mac.type == ixgbe_mac_82599EB) gpie |= IXGBE_SDP2_GPIEN; /* Thermal Failure Detection */ if (hw->mac.type == ixgbe_mac_X540) gpie |= IXGBE_SDP0_GPIEN; if (adapter->msix > 1) { /* Enable Enhanced MSIX mode */ gpie |= IXGBE_GPIE_MSIX_MODE; gpie |= IXGBE_GPIE_EIAME | IXGBE_GPIE_PBA_SUPPORT | IXGBE_GPIE_OCD; } IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie); /* Set MTU size */ if (ifp->if_mtu > ETHERMTU) { mhadd = IXGBE_READ_REG(hw, IXGBE_MHADD); mhadd &= ~IXGBE_MHADD_MFS_MASK; mhadd |= adapter->max_frame_size << IXGBE_MHADD_MFS_SHIFT; IXGBE_WRITE_REG(hw, IXGBE_MHADD, mhadd); } /* Now enable all the queues */ for (int i = 0; i < adapter->num_queues; i++) { txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(i)); txdctl |= IXGBE_TXDCTL_ENABLE; /* Set WTHRESH to 8, burst writeback */ txdctl |= (8 << 16); /* * When the internal queue falls below PTHRESH (32), * start prefetching as long as there are at least * HTHRESH (1) buffers ready. The values are taken * from the Intel linux driver 3.8.21. * Prefetching enables tx line rate even with 1 queue. */ txdctl |= (32 << 0) | (1 << 8); IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(i), txdctl); } for (int i = 0; i < adapter->num_queues; i++) { rxdctl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(i)); if (hw->mac.type == ixgbe_mac_82598EB) { /* ** PTHRESH = 21 ** HTHRESH = 4 ** WTHRESH = 8 */ rxdctl &= ~0x3FFFFF; rxdctl |= 0x080420; } rxdctl |= IXGBE_RXDCTL_ENABLE; IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(i), rxdctl); for (k = 0; k < 10; k++) { if (IXGBE_READ_REG(hw, IXGBE_RXDCTL(i)) & IXGBE_RXDCTL_ENABLE) break; else msec_delay(1); } wmb(); #ifdef DEV_NETMAP /* * In netmap mode, we must preserve the buffers made * available to userspace before the if_init() * (this is true by default on the TX side, because * init makes all buffers available to userspace). * * netmap_reset() and the device specific routines * (e.g. ixgbe_setup_receive_rings()) map these * buffers at the end of the NIC ring, so here we * must set the RDT (tail) register to make sure * they are not overwritten. * * In this driver the NIC ring starts at RDH = 0, * RDT points to the last slot available for reception (?), * so RDT = num_rx_desc - 1 means the whole ring is available. */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t); } else #endif /* DEV_NETMAP */ IXGBE_WRITE_REG(hw, IXGBE_RDT(i), adapter->num_rx_desc - 1); } /* Enable Receive engine */ rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL); if (hw->mac.type == ixgbe_mac_82598EB) rxctrl |= IXGBE_RXCTRL_DMBYPS; rxctrl |= IXGBE_RXCTRL_RXEN; ixgbe_enable_rx_dma(hw, rxctrl); callout_reset(&adapter->timer, hz, ixgbe_local_timer, adapter); /* Set up MSI/X routing */ if (ixgbe_enable_msix) { ixgbe_configure_ivars(adapter); /* Set up auto-mask */ if (hw->mac.type == ixgbe_mac_82598EB) IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE); else { IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF); IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF); } } else { /* Simple settings for Legacy/MSI */ ixgbe_set_ivar(adapter, 0, 0, 0); ixgbe_set_ivar(adapter, 0, 0, 1); IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE); } #ifdef IXGBE_FDIR /* Init Flow director */ if (hw->mac.type != ixgbe_mac_82598EB) { u32 hdrm = 32 << fdir_pballoc; hw->mac.ops.setup_rxpba(hw, 0, hdrm, PBA_STRATEGY_EQUAL); ixgbe_init_fdir_signature_82599(&adapter->hw, fdir_pballoc); } #endif /* ** Check on any SFP devices that ** need to be kick-started */ if (hw->phy.type == ixgbe_phy_none) { int err = hw->phy.ops.identify(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module type was detected.\n"); return; } } /* Set moderation on the Link interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EITR(adapter->linkvec), IXGBE_LINK_ITR); /* Config/Enable Link */ ixgbe_config_link(adapter); /* Hardware Packet Buffer & Flow Control setup */ { u32 rxpb, frame, size, tmp; frame = adapter->max_frame_size; /* Calculate High Water */ if (hw->mac.type == ixgbe_mac_X540) tmp = IXGBE_DV_X540(frame, frame); else tmp = IXGBE_DV(frame, frame); size = IXGBE_BT2KB(tmp); rxpb = IXGBE_READ_REG(hw, IXGBE_RXPBSIZE(0)) >> 10; hw->fc.high_water[0] = rxpb - size; /* Now calculate Low Water */ if (hw->mac.type == ixgbe_mac_X540) tmp = IXGBE_LOW_DV_X540(frame); else tmp = IXGBE_LOW_DV(frame); hw->fc.low_water[0] = IXGBE_BT2KB(tmp); hw->fc.requested_mode = adapter->fc; hw->fc.pause_time = IXGBE_FC_PAUSE; hw->fc.send_xon = TRUE; } /* Initialize the FC settings */ ixgbe_start_hw(hw); /* Set up VLAN support and filter */ ixgbe_setup_vlan_hw_support(adapter); /* And now turn on interrupts */ ixgbe_enable_intr(adapter); /* Now inform the stack we're ready */ ifp->if_drv_flags |= IFF_DRV_RUNNING; return; } static void ixgbe_init(void *arg) { struct adapter *adapter = arg; IXGBE_CORE_LOCK(adapter); ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); return; } /* ** ** MSIX Interrupt Handlers and Tasklets ** */ static inline void ixgbe_enable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u64 queue = (u64)(1 << vector); u32 mask; if (hw->mac.type == ixgbe_mac_82598EB) { mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask); } else { mask = (queue & 0xFFFFFFFF); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask); mask = (queue >> 32); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask); } } static inline void ixgbe_disable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u64 queue = (u64)(1 << vector); u32 mask; if (hw->mac.type == ixgbe_mac_82598EB) { mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_EIMC, mask); } else { mask = (queue & 0xFFFFFFFF); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(0), mask); mask = (queue >> 32); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(1), mask); } } static void ixgbe_handle_que(void *context, int pending) { struct ix_queue *que = context; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; bool more; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifndef IXGBE_LEGACY_TX if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixgbe_start_locked(txr, ifp); #endif IXGBE_TX_UNLOCK(txr); } /* Reenable this interrupt */ if (que->res != NULL) ixgbe_enable_queue(adapter, que->msix); else ixgbe_enable_intr(adapter); return; } /********************************************************************* * * Legacy Interrupt Service routine * **********************************************************************/ static void ixgbe_legacy_irq(void *arg) { struct ix_queue *que = arg; struct adapter *adapter = que->adapter; struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; bool more; u32 reg_eicr; reg_eicr = IXGBE_READ_REG(hw, IXGBE_EICR); ++que->irqs; if (reg_eicr == 0) { ixgbe_enable_intr(adapter); return; } more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifdef IXGBE_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixgbe_start_locked(txr, ifp); #else if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #endif IXGBE_TX_UNLOCK(txr); /* Check for fan failure */ if ((hw->phy.media_type == ixgbe_media_type_copper) && (reg_eicr & IXGBE_EICR_GPI_SDP1)) { device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! " "REPLACE IMMEDIATELY!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EICR_GPI_SDP1); } /* Link status change */ if (reg_eicr & IXGBE_EICR_LSC) taskqueue_enqueue(adapter->tq, &adapter->link_task); if (more) taskqueue_enqueue(que->tq, &que->que_task); else ixgbe_enable_intr(adapter); return; } /********************************************************************* * * MSIX Queue Interrupt Service routine * **********************************************************************/ void ixgbe_msix_que(void *arg) { struct ix_queue *que = arg; struct adapter *adapter = que->adapter; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; bool more; u32 newitr = 0; /* Protect against spurious interrupts */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ixgbe_disable_queue(adapter, que->msix); ++que->irqs; more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifdef IXGBE_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(ifp->if_snd)) ixgbe_start_locked(txr, ifp); #else if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #endif IXGBE_TX_UNLOCK(txr); /* Do AIM now? */ if (ixgbe_enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: ** - Write out last calculated setting ** - Calculate based on average size over ** the last interval. */ if (que->eitr_setting) IXGBE_WRITE_REG(&adapter->hw, IXGBE_EITR(que->msix), que->eitr_setting); que->eitr_setting = 0; /* Idle, do nothing */ if ((txr->bytes == 0) && (rxr->bytes == 0)) goto no_calc; if ((txr->bytes) && (txr->packets)) newitr = txr->bytes/txr->packets; if ((rxr->bytes) && (rxr->packets)) newitr = max(newitr, (rxr->bytes / rxr->packets)); newitr += 24; /* account for hardware frame, crc */ /* set an upper boundary */ newitr = min(newitr, 3000); /* Be nice to the mid range */ if ((newitr > 300) && (newitr < 1200)) newitr = (newitr / 3); else newitr = (newitr / 2); if (adapter->hw.mac.type == ixgbe_mac_82598EB) newitr |= newitr << 16; else newitr |= IXGBE_EITR_CNT_WDIS; /* save for next interrupt */ que->eitr_setting = newitr; /* Reset state */ txr->bytes = 0; txr->packets = 0; rxr->bytes = 0; rxr->packets = 0; no_calc: if (more) taskqueue_enqueue(que->tq, &que->que_task); else ixgbe_enable_queue(adapter, que->msix); return; } static void ixgbe_msix_link(void *arg) { struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; u32 reg_eicr; ++adapter->link_irq; /* First get the cause */ reg_eicr = IXGBE_READ_REG(hw, IXGBE_EICS); /* Be sure the queue bits are not cleared */ reg_eicr &= ~IXGBE_EICR_RTX_QUEUE; /* Clear interrupt with write */ IXGBE_WRITE_REG(hw, IXGBE_EICR, reg_eicr); /* Link status change */ if (reg_eicr & IXGBE_EICR_LSC) taskqueue_enqueue(adapter->tq, &adapter->link_task); if (adapter->hw.mac.type != ixgbe_mac_82598EB) { #ifdef IXGBE_FDIR if (reg_eicr & IXGBE_EICR_FLOW_DIR) { /* This is probably overkill :) */ if (!atomic_cmpset_int(&adapter->fdir_reinit, 0, 1)) return; /* Disable the interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EIMC, IXGBE_EICR_FLOW_DIR); taskqueue_enqueue(adapter->tq, &adapter->fdir_task); } else #endif if (reg_eicr & IXGBE_EICR_ECC) { device_printf(adapter->dev, "\nCRITICAL: ECC ERROR!! " "Please Reboot!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_ECC); } else if (reg_eicr & IXGBE_EICR_GPI_SDP1) { /* Clear the interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1); taskqueue_enqueue(adapter->tq, &adapter->msf_task); } else if (reg_eicr & IXGBE_EICR_GPI_SDP2) { /* Clear the interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP2); taskqueue_enqueue(adapter->tq, &adapter->mod_task); } } /* Check for fan failure */ if ((hw->device_id == IXGBE_DEV_ID_82598AT) && (reg_eicr & IXGBE_EICR_GPI_SDP1)) { device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! " "REPLACE IMMEDIATELY!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1); } /* Check for over temp condition */ if ((hw->mac.type == ixgbe_mac_X540) && (reg_eicr & IXGBE_EICR_TS)) { device_printf(adapter->dev, "\nCRITICAL: OVER TEMP!! " "PHY IS SHUT DOWN!!\n"); device_printf(adapter->dev, "System shutdown required\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_TS); } IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMS, IXGBE_EIMS_OTHER); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. * **********************************************************************/ static void ixgbe_media_status(struct ifnet * ifp, struct ifmediareq * ifmr) { struct adapter *adapter = ifp->if_softc; INIT_DEBUGOUT("ixgbe_media_status: begin"); IXGBE_CORE_LOCK(adapter); ixgbe_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { IXGBE_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; switch (adapter->link_speed) { case IXGBE_LINK_SPEED_100_FULL: ifmr->ifm_active |= IFM_100_TX | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_SX | IFM_FDX; break; case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= adapter->optics | IFM_FDX; break; } IXGBE_CORE_UNLOCK(adapter); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. * **********************************************************************/ static int ixgbe_media_change(struct ifnet * ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; INIT_DEBUGOUT("ixgbe_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: adapter->hw.phy.autoneg_advertised = IXGBE_LINK_SPEED_100_FULL | IXGBE_LINK_SPEED_1GB_FULL | IXGBE_LINK_SPEED_10GB_FULL; break; default: device_printf(adapter->dev, "Only auto media type\n"); return (EINVAL); } return (0); } /********************************************************************* * * This routine maps the mbufs to tx descriptors, allowing the * TX engine to transmit the packets. * - return 0 on success, positive on failure * **********************************************************************/ static int ixgbe_xmit(struct tx_ring *txr, struct mbuf **m_headp) { struct adapter *adapter = txr->adapter; u32 olinfo_status = 0, cmd_type_len; int i, j, error, nsegs; int first; bool remap = TRUE; struct mbuf *m_head; bus_dma_segment_t segs[adapter->num_segs]; bus_dmamap_t map; struct ixgbe_tx_buf *txbuf; union ixgbe_adv_tx_desc *txd = NULL; m_head = *m_headp; /* Basic descriptor defines */ cmd_type_len = (IXGBE_ADVTXD_DTYP_DATA | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT); if (m_head->m_flags & M_VLANTAG) cmd_type_len |= IXGBE_ADVTXD_DCMD_VLE; /* * Important to capture the first descriptor * used because it will contain the index of * the one we tell the hardware to report back */ first = txr->next_avail_desc; txbuf = &txr->tx_buffers[first]; map = txbuf->map; /* * Map the packet for DMA. */ retry: error = bus_dmamap_load_mbuf_sg(txr->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (__predict_false(error)) { struct mbuf *m; switch (error) { case EFBIG: /* Try it again? - one try */ if (remap == TRUE) { remap = FALSE; m = m_defrag(*m_headp, M_NOWAIT); if (m == NULL) { adapter->mbuf_defrag_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; goto retry; } else return (error); case ENOMEM: txr->no_tx_dma_setup++; return (error); default: txr->no_tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } } /* Make certain there are enough descriptors */ if (nsegs > txr->tx_avail - 2) { txr->no_desc_avail++; bus_dmamap_unload(txr->txtag, map); return (ENOBUFS); } m_head = *m_headp; /* ** Set up the appropriate offload context ** this will consume the first descriptor */ error = ixgbe_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status); if (__predict_false(error)) { if (error == ENOBUFS) *m_headp = NULL; return (error); } #ifdef IXGBE_FDIR /* Do the flow director magic */ if ((txr->atr_sample) && (!adapter->fdir_reinit)) { ++txr->atr_count; if (txr->atr_count >= atr_sample_rate) { ixgbe_atr(txr, m_head); txr->atr_count = 0; } } #endif i = txr->next_avail_desc; for (j = 0; j < nsegs; j++) { bus_size_t seglen; bus_addr_t segaddr; txbuf = &txr->tx_buffers[i]; txd = &txr->tx_base[i]; seglen = segs[j].ds_len; segaddr = htole64(segs[j].ds_addr); txd->read.buffer_addr = segaddr; txd->read.cmd_type_len = htole32(txr->txd_cmd | cmd_type_len |seglen); txd->read.olinfo_status = htole32(olinfo_status); if (++i == txr->num_desc) i = 0; } txd->read.cmd_type_len |= htole32(IXGBE_TXD_CMD_EOP | IXGBE_TXD_CMD_RS); txr->tx_avail -= nsegs; txr->next_avail_desc = i; txbuf->m_head = m_head; /* ** Here we swap the map so the last descriptor, ** which gets the completion interrupt has the ** real map, and the first descriptor gets the ** unused map from this descriptor. */ txr->tx_buffers[first].map = txbuf->map; txbuf->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); /* Set the EOP descriptor that will be marked done */ txbuf = &txr->tx_buffers[first]; txbuf->eop = txd; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * Advance the Transmit Descriptor Tail (Tdt), this tells the * hardware that this frame is available to transmit. */ ++txr->total_packets; IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), i); return (0); } static void ixgbe_set_promisc(struct adapter *adapter) { u_int32_t reg_rctl; struct ifnet *ifp = adapter->ifp; int mcnt = 0; reg_rctl = IXGBE_READ_REG(&adapter->hw, IXGBE_FCTRL); reg_rctl &= (~IXGBE_FCTRL_UPE); if (ifp->if_flags & IFF_ALLMULTI) mcnt = MAX_NUM_MULTICAST_ADDRESSES; else { struct ifmultiaddr *ifma; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif } if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) reg_rctl &= (~IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); if (ifp->if_flags & IFF_PROMISC) { reg_rctl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); } else if (ifp->if_flags & IFF_ALLMULTI) { reg_rctl |= IXGBE_FCTRL_MPE; reg_rctl &= ~IXGBE_FCTRL_UPE; IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); } return; } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ #define IXGBE_RAR_ENTRIES 16 static void ixgbe_set_multi(struct adapter *adapter) { u32 fctrl; u8 *mta; u8 *update_ptr; struct ifmultiaddr *ifma; int mcnt = 0; struct ifnet *ifp = adapter->ifp; IOCTL_DEBUGOUT("ixgbe_set_multi: begin"); mta = adapter->mta; bzero(mta, sizeof(u8) * IXGBE_ETH_LENGTH_OF_ADDRESS * MAX_NUM_MULTICAST_ADDRESSES); #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), &mta[mcnt * IXGBE_ETH_LENGTH_OF_ADDRESS], IXGBE_ETH_LENGTH_OF_ADDRESS); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif fctrl = IXGBE_READ_REG(&adapter->hw, IXGBE_FCTRL); fctrl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); if (ifp->if_flags & IFF_PROMISC) fctrl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); else if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES || ifp->if_flags & IFF_ALLMULTI) { fctrl |= IXGBE_FCTRL_MPE; fctrl &= ~IXGBE_FCTRL_UPE; } else fctrl &= ~(IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, fctrl); if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) { update_ptr = mta; ixgbe_update_mc_addr_list(&adapter->hw, update_ptr, mcnt, ixgbe_mc_array_itr, TRUE); } return; } /* * This is an iterator function now needed by the multicast * shared code. It simply feeds the shared code routine the * addresses in the array of ixgbe_set_multi() one by one. */ static u8 * ixgbe_mc_array_itr(struct ixgbe_hw *hw, u8 **update_ptr, u32 *vmdq) { u8 *addr = *update_ptr; u8 *newptr; *vmdq = 0; newptr = addr + IXGBE_ETH_LENGTH_OF_ADDRESS; *update_ptr = newptr; return addr; } /********************************************************************* * Timer routine * * This routine checks for link status,updates statistics, * and runs the watchdog check. * **********************************************************************/ static void ixgbe_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; int hung = 0, paused = 0; mtx_assert(&adapter->core_mtx, MA_OWNED); /* Check for pluggable optics */ if (adapter->sfp_probe) if (!ixgbe_sfp_probe(adapter)) goto out; /* Nothing to do */ ixgbe_update_link_status(adapter); ixgbe_update_stats_counters(adapter); /* * If the interface has been paused * then don't do the watchdog check */ if (IXGBE_READ_REG(&adapter->hw, IXGBE_TFCS) & IXGBE_TFCS_TXOFF) paused = 1; /* ** Check the TX queues status ** - watchdog only if all queues show hung */ for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { if ((txr->queue_status == IXGBE_QUEUE_HUNG) && (paused == 0)) ++hung; else if (txr->queue_status == IXGBE_QUEUE_WORKING) taskqueue_enqueue(que->tq, &txr->txq_task); } /* Only truely watchdog if all queues show hung */ if (hung == adapter->num_queues) goto watchdog; out: callout_reset(&adapter->timer, hz, ixgbe_local_timer, adapter); return; watchdog: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); device_printf(dev,"Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(txr->me)), IXGBE_READ_REG(&adapter->hw, IXGBE_TDT(txr->me))); device_printf(dev,"TX(%d) desc avail = %d," "Next TX to Clean = %d\n", txr->me, txr->tx_avail, txr->next_to_clean); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; ixgbe_init_locked(adapter); } /* ** Note: this routine updates the OS on the link state ** the real check of the hardware only happens with ** a link interrupt. */ static void ixgbe_update_link_status(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; if (adapter->link_up){ if (adapter->link_active == FALSE) { if (bootverbose) device_printf(dev,"Link is up %d Gbps %s \n", ((adapter->link_speed == 128)? 10:1), "Full Duplex"); adapter->link_active = TRUE; /* Update any Flow Control changes */ ixgbe_fc_enable(&adapter->hw); if_link_state_change(ifp, LINK_STATE_UP); } } else { /* Link down */ if (adapter->link_active == TRUE) { if (bootverbose) device_printf(dev,"Link is Down\n"); if_link_state_change(ifp, LINK_STATE_DOWN); adapter->link_active = FALSE; } } return; } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * **********************************************************************/ static void ixgbe_stop(void *arg) { struct ifnet *ifp; struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; ifp = adapter->ifp; mtx_assert(&adapter->core_mtx, MA_OWNED); INIT_DEBUGOUT("ixgbe_stop: begin\n"); ixgbe_disable_intr(adapter); callout_stop(&adapter->timer); /* Let the stack know...*/ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ixgbe_reset_hw(hw); hw->adapter_stopped = FALSE; ixgbe_stop_adapter(hw); if (hw->mac.type == ixgbe_mac_82599EB) ixgbe_stop_mac_link_on_d3_82599(hw); /* Turn off the laser - noop with no optics */ ixgbe_disable_tx_laser(hw); /* Update the stack */ adapter->link_up = FALSE; ixgbe_update_link_status(adapter); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(&adapter->hw, 0, adapter->hw.mac.addr, 0, IXGBE_RAH_AV); return; } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void ixgbe_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; /* Save off the information about this board */ hw->vendor_id = pci_get_vendor(dev); hw->device_id = pci_get_device(dev); hw->revision_id = pci_read_config(dev, PCIR_REVID, 1); hw->subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); hw->subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); /* We need this here to set the num_segs below */ ixgbe_set_mac_type(hw); /* Pick up the 82599 and VF settings */ if (hw->mac.type != ixgbe_mac_82598EB) { hw->phy.smart_speed = ixgbe_smart_speed; adapter->num_segs = IXGBE_82599_SCATTER; } else adapter->num_segs = IXGBE_82598_SCATTER; return; } /********************************************************************* * * Determine optic type * **********************************************************************/ static void ixgbe_setup_optics(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; int layer; layer = ixgbe_get_supported_physical_layer(hw); if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_T) { adapter->optics = IFM_10G_T; return; } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_T) { adapter->optics = IFM_1000_T; return; } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_SX) { adapter->optics = IFM_1000_SX; return; } if (layer & (IXGBE_PHYSICAL_LAYER_10GBASE_LR | IXGBE_PHYSICAL_LAYER_10GBASE_LRM)) { adapter->optics = IFM_10G_LR; return; } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_SR) { adapter->optics = IFM_10G_SR; return; } if (layer & IXGBE_PHYSICAL_LAYER_SFP_PLUS_CU) { adapter->optics = IFM_10G_TWINAX; return; } if (layer & (IXGBE_PHYSICAL_LAYER_10GBASE_KX4 | IXGBE_PHYSICAL_LAYER_10GBASE_CX4)) { adapter->optics = IFM_10G_CX4; return; } /* If we get here just set the default */ adapter->optics = IFM_ETHER | IFM_AUTO; return; } /********************************************************************* * * Setup the Legacy or MSI Interrupt handler * **********************************************************************/ static int ixgbe_allocate_legacy(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; #ifndef IXGBE_LEGACY_TX struct tx_ring *txr = adapter->tx_rings; #endif int error, rid = 0; /* MSI RID at 1 */ if (adapter->msix == 1) rid = 1; /* We allocate a single interrupt resource */ adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "interrupt\n"); return (ENXIO); } /* * Try allocating a fast interrupt and the associated deferred * processing contexts. */ #ifndef IXGBE_LEGACY_TX TASK_INIT(&txr->txq_task, 0, ixgbe_deferred_mq_start, txr); #endif TASK_INIT(&que->que_task, 0, ixgbe_handle_que, que); que->tq = taskqueue_create_fast("ixgbe_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s ixq", device_get_nameunit(adapter->dev)); /* Tasklets for Link, SFP and Multispeed Fiber */ TASK_INIT(&adapter->link_task, 0, ixgbe_handle_link, adapter); TASK_INIT(&adapter->mod_task, 0, ixgbe_handle_mod, adapter); TASK_INIT(&adapter->msf_task, 0, ixgbe_handle_msf, adapter); #ifdef IXGBE_FDIR TASK_INIT(&adapter->fdir_task, 0, ixgbe_reinit_fdir, adapter); #endif adapter->tq = taskqueue_create_fast("ixgbe_link", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s linkq", device_get_nameunit(adapter->dev)); if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_legacy_irq, que, &adapter->tag)) != 0) { device_printf(dev, "Failed to register fast interrupt " "handler: %d\n", error); taskqueue_free(que->tq); taskqueue_free(adapter->tq); que->tq = NULL; adapter->tq = NULL; return (error); } /* For simplicity in the handlers */ adapter->que_mask = IXGBE_EIMS_ENABLE_MASK; return (0); } /********************************************************************* * * Setup MSIX Interrupt resources and handlers * **********************************************************************/ static int ixgbe_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; int error, rid, vector = 0; int cpu_id = 0; #ifdef RSS /* * If we're doing RSS, the number of queues needs to * match the number of RSS buckets that are configured. * * + If there's more queues than RSS buckets, we'll end * up with queues that get no traffic. * * + If there's more RSS buckets than queues, we'll end * up having multiple RSS buckets map to the same queue, * so there'll be some contention. */ if (adapter->num_queues != rss_getnumbuckets()) { device_printf(dev, "%s: number of queues (%d) != number of RSS buckets (%d)" "; performance will be impacted.\n", __func__, adapter->num_queues, rss_getnumbuckets()); } #endif for (int i = 0; i < adapter->num_queues; i++, vector++, que++, txr++) { rid = vector + 1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (que->res == NULL) { device_printf(dev,"Unable to allocate" " bus resource: que interrupt [%d]\n", vector); return (ENXIO); } /* Set the handler function */ error = bus_setup_intr(dev, que->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_msix_que, que, &que->tag); if (error) { que->res = NULL; device_printf(dev, "Failed to register QUE handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, que->res, que->tag, "que %d", i); #endif que->msix = vector; adapter->que_mask |= (u64)(1 << que->msix); #ifdef RSS /* * The queue ID is used as the RSS layer bucket ID. * We look up the queue ID -> RSS CPU ID and select * that. */ cpu_id = rss_getcpu(i % rss_getnumbuckets()); #else /* * Bind the msix vector, and thus the * rings to the corresponding cpu. * * This just happens to match the default RSS round-robin * bucket -> queue -> CPU allocation. */ if (adapter->num_queues > 1) cpu_id = i; #endif if (adapter->num_queues > 1) bus_bind_intr(dev, que->res, cpu_id); #ifdef RSS device_printf(dev, "Bound RSS bucket %d to CPU %d\n", i, cpu_id); #else device_printf(dev, "Bound queue %d to cpu %d\n", i, cpu_id); #endif #ifndef IXGBE_LEGACY_TX TASK_INIT(&txr->txq_task, 0, ixgbe_deferred_mq_start, txr); #endif TASK_INIT(&que->que_task, 0, ixgbe_handle_que, que); que->tq = taskqueue_create_fast("ixgbe_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); #ifdef RSS taskqueue_start_threads_pinned(&que->tq, 1, PI_NET, cpu_id, "%s (bucket %d)", device_get_nameunit(adapter->dev), cpu_id); #else taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); #endif } /* and Link */ rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (!adapter->res) { device_printf(dev,"Unable to allocate" " bus resource: Link interrupt [%d]\n", rid); return (ENXIO); } /* Set the link handler function */ error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_msix_link, adapter, &adapter->tag); if (error) { adapter->res = NULL; device_printf(dev, "Failed to register LINK handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->linkvec = vector; /* Tasklets for Link, SFP and Multispeed Fiber */ TASK_INIT(&adapter->link_task, 0, ixgbe_handle_link, adapter); TASK_INIT(&adapter->mod_task, 0, ixgbe_handle_mod, adapter); TASK_INIT(&adapter->msf_task, 0, ixgbe_handle_msf, adapter); #ifdef IXGBE_FDIR TASK_INIT(&adapter->fdir_task, 0, ixgbe_reinit_fdir, adapter); #endif adapter->tq = taskqueue_create_fast("ixgbe_link", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s linkq", device_get_nameunit(adapter->dev)); return (0); } /* * Setup Either MSI/X or MSI */ static int ixgbe_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; int rid, want, queues, msgs; int n_queues; /* Override by tuneable */ if (ixgbe_enable_msix == 0) goto msi; /* First try MSI/X */ msgs = pci_msix_count(dev); if (msgs == 0) goto msi; rid = PCIR_BAR(MSIX_82598_BAR); adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->msix_mem == NULL) { rid += 4; /* 82599 maps in higher BAR */ adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); } if (adapter->msix_mem == NULL) { /* May not be enabled */ device_printf(adapter->dev, "Unable to map MSIX table \n"); goto msi; } /* Figure out a reasonable auto config value */ queues = (mp_ncpus > (msgs-1)) ? (msgs-1) : mp_ncpus; #ifdef RSS /* If we're doing RSS, clamp at the number of RSS buckets */ if (queues > rss_getnumbuckets()) queues = rss_getnumbuckets(); #endif /* try more specific tunable, then global, then finally default to boot time tunable if set. */ if (device_getenv_int(dev, "num_queues", &n_queues) != 0) { device_printf(dev, "using specific tunable numqueues=%d", n_queues); } else if (TUNABLE_INT_FETCH("hw.ix.num_queues", &n_queues) != 0) { if (ixgbe_num_queues != n_queues) { device_printf(dev, "using global tunable num_queues=%d", n_queues); ixgbe_num_queues = n_queues; } } else { n_queues = ixgbe_num_queues; } if (n_queues < 0) { device_printf(dev, "tunable < 0, resetting to default"); n_queues = 0; } if (n_queues != 0) queues = n_queues; /* Set max queues to 8 when autoconfiguring */ else if ((ixgbe_num_queues == 0) && (queues > 8)) queues = 8; /* ** Want one vector (RX/TX pair) per queue ** plus an additional for Link. */ want = queues + 1; if (msgs >= want) msgs = want; else { device_printf(adapter->dev, "MSIX Configuration Problem, " "%d vectors but %d queues wanted!\n", msgs, want); goto msi; } if ((pci_alloc_msix(dev, &msgs) == 0) && (msgs == want)) { device_printf(adapter->dev, "Using MSIX interrupts with %d vectors\n", msgs); adapter->num_queues = queues; return (msgs); } /* ** If MSIX alloc failed or provided us with ** less than needed, free and fall through to MSI */ pci_release_msi(dev); msi: if (adapter->msix_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, rid, adapter->msix_mem); adapter->msix_mem = NULL; } msgs = 1; if (pci_alloc_msi(dev, &msgs) == 0) { device_printf(adapter->dev,"Using an MSI interrupt\n"); return (msgs); } device_printf(adapter->dev,"Using a Legacy interrupt\n"); return (0); } static int ixgbe_allocate_pci_resources(struct adapter *adapter) { int rid; device_t dev = adapter->dev; rid = PCIR_BAR(0); adapter->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!(adapter->pci_mem)) { device_printf(dev,"Unable to allocate bus resource: memory\n"); return (ENXIO); } adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->pci_mem); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->pci_mem); adapter->hw.hw_addr = (u8 *) &adapter->osdep.mem_bus_space_handle; /* Legacy defaults */ adapter->num_queues = 1; adapter->hw.back = &adapter->osdep; /* ** Now setup MSI or MSI/X, should ** return us the number of supported ** vectors. (Will be 1 for MSI) */ adapter->msix = ixgbe_setup_msix(adapter); return (0); } static void ixgbe_free_pci_resources(struct adapter * adapter) { struct ix_queue *que = adapter->queues; device_t dev = adapter->dev; int rid, memrid; if (adapter->hw.mac.type == ixgbe_mac_82598EB) memrid = PCIR_BAR(MSIX_82598_BAR); else memrid = PCIR_BAR(MSIX_82599_BAR); /* ** There is a slight possibility of a failure mode ** in attach that will result in entering this function ** before interrupt resources have been initialized, and ** in that case we do not want to execute the loops below ** We can detect this reliably by the state of the adapter ** res pointer. */ if (adapter->res == NULL) goto mem; /* ** Release all msix queue resources: */ for (int i = 0; i < adapter->num_queues; i++, que++) { rid = que->msix + 1; if (que->tag != NULL) { bus_teardown_intr(dev, que->res, que->tag); que->tag = NULL; } if (que->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, que->res); } /* Clean the Legacy or Link interrupt last */ if (adapter->linkvec) /* we are doing MSIX */ rid = adapter->linkvec + 1; else (adapter->msix != 0) ? (rid = 1):(rid = 0); if (adapter->tag != NULL) { bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); mem: if (adapter->msix) pci_release_msi(dev); if (adapter->msix_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, memrid, adapter->msix_mem); if (adapter->pci_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), adapter->pci_mem); return; } /********************************************************************* * * Setup networking device structure and register an interface. * **********************************************************************/ static int ixgbe_setup_interface(device_t dev, struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp; INIT_DEBUGOUT("ixgbe_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (-1); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); ifp->if_init = ixgbe_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ixgbe_ioctl; ifp->if_get_counter = ixgbe_get_counter; #ifndef IXGBE_LEGACY_TX ifp->if_transmit = ixgbe_mq_start; ifp->if_qflush = ixgbe_qflush; #else ifp->if_start = ixgbe_start; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 2); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 2; IFQ_SET_READY(&ifp->if_snd); #endif ether_ifattach(ifp, adapter->hw.mac.addr); adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; /* * Tell the upper layer(s) we support long frames. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_TSO | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_LRO; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU | IFCAP_HWSTATS; ifp->if_capenable = ifp->if_capabilities; /* ** Don't turn this on by default, if vlans are ** created on another pseudo device (eg. lagg) ** then vlan events are not passed thru, breaking ** operation, but with HW FILTER off it works. If ** using vlans directly on the ixgbe driver you can ** enable this and get full hardware tag filtering. */ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, ixgbe_media_change, ixgbe_media_status); ifmedia_add(&adapter->media, IFM_ETHER | adapter->optics, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | adapter->optics); if (hw->device_id == IXGBE_DEV_ID_82598AT) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T, 0, NULL); } ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return (0); } static void ixgbe_config_link(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 autoneg, err = 0; bool sfp, negotiate; sfp = ixgbe_is_sfp(hw); if (sfp) { if (hw->phy.multispeed_fiber) { hw->mac.ops.setup_sfp(hw); ixgbe_enable_tx_laser(hw); taskqueue_enqueue(adapter->tq, &adapter->msf_task); } else taskqueue_enqueue(adapter->tq, &adapter->mod_task); } else { if (hw->mac.ops.check_link) err = ixgbe_check_link(hw, &adapter->link_speed, &adapter->link_up, FALSE); if (err) goto out; autoneg = hw->phy.autoneg_advertised; if ((!autoneg) && (hw->mac.ops.get_link_capabilities)) err = hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiate); if (err) goto out; if (hw->mac.ops.setup_link) err = hw->mac.ops.setup_link(hw, autoneg, adapter->link_up); } out: return; } /******************************************************************** * Manage DMA'able memory. *******************************************************************/ static void ixgbe_dmamap_cb(void *arg, bus_dma_segment_t * segs, int nseg, int error) { if (error) return; *(bus_addr_t *) arg = segs->ds_addr; return; } static int ixgbe_dma_malloc(struct adapter *adapter, bus_size_t size, struct ixgbe_dma_alloc *dma, int mapflags) { device_t dev = adapter->dev; int r; r = bus_dma_tag_create(bus_get_dma_tag(adapter->dev), /* parent */ DBA_ALIGN, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &dma->dma_tag); if (r != 0) { device_printf(dev,"ixgbe_dma_malloc: bus_dma_tag_create failed; " "error %u\n", r); goto fail_0; } r = bus_dmamem_alloc(dma->dma_tag, (void **)&dma->dma_vaddr, BUS_DMA_NOWAIT, &dma->dma_map); if (r != 0) { device_printf(dev,"ixgbe_dma_malloc: bus_dmamem_alloc failed; " "error %u\n", r); goto fail_1; } r = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, ixgbe_dmamap_cb, &dma->dma_paddr, mapflags | BUS_DMA_NOWAIT); if (r != 0) { device_printf(dev,"ixgbe_dma_malloc: bus_dmamap_load failed; " "error %u\n", r); goto fail_2; } dma->dma_size = size; return (0); fail_2: bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); fail_1: bus_dma_tag_destroy(dma->dma_tag); fail_0: dma->dma_tag = NULL; return (r); } static void ixgbe_dma_free(struct adapter *adapter, struct ixgbe_dma_alloc *dma) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); bus_dma_tag_destroy(dma->dma_tag); } /********************************************************************* * * Allocate memory for the transmit and receive rings, and then * the descriptors associated with each, called only once at attach. * **********************************************************************/ static int ixgbe_allocate_queues(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que; struct tx_ring *txr; struct rx_ring *rxr; int rsize, tsize, error = IXGBE_SUCCESS; int txconf = 0, rxconf = 0; /* First allocate the top level queue structs */ if (!(adapter->queues = (struct ix_queue *) malloc(sizeof(struct ix_queue) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate queue memory\n"); error = ENOMEM; goto fail; } /* First allocate the TX ring struct memory */ if (!(adapter->tx_rings = (struct tx_ring *) malloc(sizeof(struct tx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); error = ENOMEM; goto tx_fail; } /* Next allocate the RX */ if (!(adapter->rx_rings = (struct rx_ring *) malloc(sizeof(struct rx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); error = ENOMEM; goto rx_fail; } /* For the ring itself */ tsize = roundup2(adapter->num_tx_desc * sizeof(union ixgbe_adv_tx_desc), DBA_ALIGN); /* * Now set up the TX queues, txconf is needed to handle the * possibility that things fail midcourse and we need to * undo memory gracefully */ for (int i = 0; i < adapter->num_queues; i++, txconf++) { /* Set up some basics */ txr = &adapter->tx_rings[i]; txr->adapter = adapter; txr->me = i; txr->num_desc = adapter->num_tx_desc; /* Initialize the TX side lock */ snprintf(txr->mtx_name, sizeof(txr->mtx_name), "%s:tx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&txr->tx_mtx, txr->mtx_name, NULL, MTX_DEF); if (ixgbe_dma_malloc(adapter, tsize, &txr->txdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate TX Descriptor memory\n"); error = ENOMEM; goto err_tx_desc; } txr->tx_base = (union ixgbe_adv_tx_desc *)txr->txdma.dma_vaddr; bzero((void *)txr->tx_base, tsize); /* Now allocate transmit buffers for the ring */ if (ixgbe_allocate_transmit_buffers(txr)) { device_printf(dev, "Critical Failure setting up transmit buffers\n"); error = ENOMEM; goto err_tx_desc; } #ifndef IXGBE_LEGACY_TX /* Allocate a buf ring */ txr->br = buf_ring_alloc(IXGBE_BR_SIZE, M_DEVBUF, M_WAITOK, &txr->tx_mtx); if (txr->br == NULL) { device_printf(dev, "Critical Failure setting up buf ring\n"); error = ENOMEM; goto err_tx_desc; } #endif } /* * Next the RX queues... */ rsize = roundup2(adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN); for (int i = 0; i < adapter->num_queues; i++, rxconf++) { rxr = &adapter->rx_rings[i]; /* Set up some basics */ rxr->adapter = adapter; rxr->me = i; rxr->num_desc = adapter->num_rx_desc; /* Initialize the RX side lock */ snprintf(rxr->mtx_name, sizeof(rxr->mtx_name), "%s:rx(%d)", device_get_nameunit(dev), rxr->me); mtx_init(&rxr->rx_mtx, rxr->mtx_name, NULL, MTX_DEF); if (ixgbe_dma_malloc(adapter, rsize, &rxr->rxdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate RxDescriptor memory\n"); error = ENOMEM; goto err_rx_desc; } rxr->rx_base = (union ixgbe_adv_rx_desc *)rxr->rxdma.dma_vaddr; bzero((void *)rxr->rx_base, rsize); /* Allocate receive buffers for the ring*/ if (ixgbe_allocate_receive_buffers(rxr)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); error = ENOMEM; goto err_rx_desc; } } /* ** Finally set up the queue holding structs */ for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; que->adapter = adapter; que->txr = &adapter->tx_rings[i]; que->rxr = &adapter->rx_rings[i]; } return (0); err_rx_desc: for (rxr = adapter->rx_rings; rxconf > 0; rxr++, rxconf--) ixgbe_dma_free(adapter, &rxr->rxdma); err_tx_desc: for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--) ixgbe_dma_free(adapter, &txr->txdma); free(adapter->rx_rings, M_DEVBUF); rx_fail: free(adapter->tx_rings, M_DEVBUF); tx_fail: free(adapter->queues, M_DEVBUF); fail: return (error); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int ixgbe_allocate_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; device_t dev = adapter->dev; struct ixgbe_tx_buf *txbuf; int error, i; /* * Setup DMA descriptor areas. */ if ((error = bus_dma_tag_create( bus_get_dma_tag(adapter->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IXGBE_TSO_SIZE, /* maxsize */ adapter->num_segs, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->txtag))) { device_printf(dev,"Unable to allocate TX DMA tag\n"); goto fail; } if (!(txr->tx_buffers = (struct ixgbe_tx_buf *) malloc(sizeof(struct ixgbe_tx_buf) * adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { error = bus_dmamap_create(txr->txtag, 0, &txbuf->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } return 0; fail: /* We free all, it handles case where we are in the middle */ ixgbe_free_transmit_structures(adapter); return (error); } /********************************************************************* * * Initialize a transmit ring. * **********************************************************************/ static void ixgbe_setup_transmit_ring(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct ixgbe_tx_buf *txbuf; int i; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ /* Clear the old ring contents */ IXGBE_TX_LOCK(txr); #ifdef DEV_NETMAP /* * (under lock): if in netmap mode, do some consistency * checks and set slot to entry 0 of the netmap ring. */ slot = netmap_reset(na, NR_TX, txr->me, 0); #endif /* DEV_NETMAP */ bzero((void *)txr->tx_base, (sizeof(union ixgbe_adv_tx_desc)) * adapter->num_tx_desc); /* Reset indices */ txr->next_avail_desc = 0; txr->next_to_clean = 0; /* Free any existing tx buffers. */ txbuf = txr->tx_buffers; for (i = 0; i < txr->num_desc; i++, txbuf++) { if (txbuf->m_head != NULL) { bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, txbuf->map); m_freem(txbuf->m_head); txbuf->m_head = NULL; } #ifdef DEV_NETMAP /* * In netmap mode, set the map for the packet buffer. * NOTE: Some drivers (not this one) also need to set * the physical buffer address in the NIC ring. * Slots in the netmap ring (indexed by "si") are * kring->nkr_hwofs positions "ahead" wrt the * corresponding slot in the NIC ring. In some drivers * (not here) nkr_hwofs can be negative. Function * netmap_idx_n2k() handles wraparounds properly. */ if (slot) { int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si)); } #endif /* DEV_NETMAP */ /* Clear the EOP descriptor pointer */ txbuf->eop = NULL; } #ifdef IXGBE_FDIR /* Set the rate at which we sample packets */ if (adapter->hw.mac.type != ixgbe_mac_82598EB) txr->atr_sample = atr_sample_rate; #endif /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); IXGBE_TX_UNLOCK(txr); } /********************************************************************* * * Initialize all transmit rings. * **********************************************************************/ static int ixgbe_setup_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) ixgbe_setup_transmit_ring(txr); return (0); } /********************************************************************* * * Enable transmit unit. * **********************************************************************/ static void ixgbe_initialize_transmit_units(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct ixgbe_hw *hw = &adapter->hw; /* Setup the Base and Length of the Tx Descriptor Ring */ for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 tdba = txr->txdma.dma_paddr; u32 txctrl; IXGBE_WRITE_REG(hw, IXGBE_TDBAL(i), (tdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_TDBAH(i), (tdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_TDLEN(i), adapter->num_tx_desc * sizeof(union ixgbe_adv_tx_desc)); /* Setup the HW Tx Head and Tail descriptor pointers */ IXGBE_WRITE_REG(hw, IXGBE_TDH(i), 0); IXGBE_WRITE_REG(hw, IXGBE_TDT(i), 0); /* Setup Transmit Descriptor Cmd Settings */ txr->txd_cmd = IXGBE_TXD_CMD_IFCS; txr->queue_status = IXGBE_QUEUE_IDLE; /* Set the processing limit */ txr->process_limit = ixgbe_tx_process_limit; /* Disable Head Writeback */ switch (hw->mac.type) { case ixgbe_mac_82598EB: txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i)); break; case ixgbe_mac_82599EB: case ixgbe_mac_X540: default: txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i)); break; } txctrl &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; switch (hw->mac.type) { case ixgbe_mac_82598EB: IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), txctrl); break; case ixgbe_mac_82599EB: case ixgbe_mac_X540: default: IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), txctrl); break; } } if (hw->mac.type != ixgbe_mac_82598EB) { u32 dmatxctl, rttdcs; dmatxctl = IXGBE_READ_REG(hw, IXGBE_DMATXCTL); dmatxctl |= IXGBE_DMATXCTL_TE; IXGBE_WRITE_REG(hw, IXGBE_DMATXCTL, dmatxctl); /* Disable arbiter to set MTQC */ rttdcs = IXGBE_READ_REG(hw, IXGBE_RTTDCS); rttdcs |= IXGBE_RTTDCS_ARBDIS; IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs); IXGBE_WRITE_REG(hw, IXGBE_MTQC, IXGBE_MTQC_64Q_1PB); rttdcs &= ~IXGBE_RTTDCS_ARBDIS; IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs); } return; } /********************************************************************* * * Free all transmit rings. * **********************************************************************/ static void ixgbe_free_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) { IXGBE_TX_LOCK(txr); ixgbe_free_transmit_buffers(txr); ixgbe_dma_free(adapter, &txr->txdma); IXGBE_TX_UNLOCK(txr); IXGBE_TX_LOCK_DESTROY(txr); } free(adapter->tx_rings, M_DEVBUF); } /********************************************************************* * * Free transmit ring related data structures. * **********************************************************************/ static void ixgbe_free_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct ixgbe_tx_buf *tx_buffer; int i; INIT_DEBUGOUT("ixgbe_free_transmit_ring: begin"); if (txr->tx_buffers == NULL) return; tx_buffer = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, tx_buffer++) { if (tx_buffer->m_head != NULL) { bus_dmamap_sync(txr->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; if (tx_buffer->map != NULL) { bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } else if (tx_buffer->map != NULL) { bus_dmamap_unload(txr->txtag, tx_buffer->map); bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } #ifdef IXGBE_LEGACY_TX if (txr->br != NULL) buf_ring_free(txr->br, M_DEVBUF); #endif if (txr->tx_buffers != NULL) { free(txr->tx_buffers, M_DEVBUF); txr->tx_buffers = NULL; } if (txr->txtag != NULL) { bus_dma_tag_destroy(txr->txtag); txr->txtag = NULL; } return; } /********************************************************************* * * Advanced Context Descriptor setup for VLAN, CSUM or TSO * **********************************************************************/ static int ixgbe_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp, u32 *cmd_type_len, u32 *olinfo_status) { struct ixgbe_adv_tx_context_desc *TXD; struct ether_vlan_header *eh; struct ip *ip; struct ip6_hdr *ip6; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; int ehdrlen, ip_hlen = 0; u16 etype; u8 ipproto = 0; int offload = TRUE; int ctxd = txr->next_avail_desc; u16 vtag = 0; /* First check if TSO is to be used */ if (mp->m_pkthdr.csum_flags & CSUM_TSO) return (ixgbe_tso_setup(txr, mp, cmd_type_len, olinfo_status)); if ((mp->m_pkthdr.csum_flags & CSUM_OFFLOAD) == 0) offload = FALSE; /* Indicate the whole packet as payload when not doing TSO */ *olinfo_status |= mp->m_pkthdr.len << IXGBE_ADVTXD_PAYLEN_SHIFT; /* Now ready a context descriptor */ TXD = (struct ixgbe_adv_tx_context_desc *) &txr->tx_base[ctxd]; /* ** In advanced descriptors the vlan tag must ** be placed into the context descriptor. Hence ** we need to make one even if not doing offloads. */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT); } else if (offload == FALSE) /* ... no offload to do */ return (0); /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } /* Set the ether header length */ vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT; switch (etype) { case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = ip->ip_hl << 2; ipproto = ip->ip_p; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); ip_hlen = sizeof(struct ip6_hdr); /* XXX-BZ this will go badly in case of ext hdrs. */ ipproto = ip6->ip6_nxt; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; break; default: offload = FALSE; break; } vlan_macip_lens |= ip_hlen; type_tucmd_mlhl |= IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT; switch (ipproto) { case IPPROTO_TCP: if (mp->m_pkthdr.csum_flags & CSUM_TCP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; break; case IPPROTO_UDP: if (mp->m_pkthdr.csum_flags & CSUM_UDP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP; break; #if __FreeBSD_version >= 800000 case IPPROTO_SCTP: if (mp->m_pkthdr.csum_flags & CSUM_SCTP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP; break; #endif default: offload = FALSE; break; } if (offload) /* For the TX descriptor setup */ *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; /* Now copy bits into descriptor */ TXD->vlan_macip_lens = htole32(vlan_macip_lens); TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); TXD->seqnum_seed = htole32(0); TXD->mss_l4len_idx = htole32(0); /* We've consumed the first desc, adjust counters */ if (++ctxd == txr->num_desc) ctxd = 0; txr->next_avail_desc = ctxd; --txr->tx_avail; return (0); } /********************************************************************** * * Setup work for hardware segmentation offload (TSO) on * adapters using advanced tx descriptors * **********************************************************************/ static int ixgbe_tso_setup(struct tx_ring *txr, struct mbuf *mp, u32 *cmd_type_len, u32 *olinfo_status) { struct ixgbe_adv_tx_context_desc *TXD; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; u32 mss_l4len_idx = 0, paylen; u16 vtag = 0, eh_type; int ctxd, ehdrlen, ip_hlen, tcp_hlen; struct ether_vlan_header *eh; #ifdef INET6 struct ip6_hdr *ip6; #endif #ifdef INET struct ip *ip; #endif struct tcphdr *th; /* * Determine where frame payload starts. * Jump over vlan headers if already present */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; eh_type = eh->evl_proto; } else { ehdrlen = ETHER_HDR_LEN; eh_type = eh->evl_encap_proto; } switch (ntohs(eh_type)) { #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); /* XXX-BZ For now we do not pretend to support ext. hdrs. */ if (ip6->ip6_nxt != IPPROTO_TCP) return (ENXIO); ip_hlen = sizeof(struct ip6_hdr); ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; break; #endif #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); if (ip->ip_p != IPPROTO_TCP) return (ENXIO); ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; /* Tell transmit desc to also do IPv4 checksum. */ *olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8; break; #endif default: panic("%s: CSUM_TSO but no supported IP version (0x%04x)", __func__, ntohs(eh_type)); break; } ctxd = txr->next_avail_desc; TXD = (struct ixgbe_adv_tx_context_desc *) &txr->tx_base[ctxd]; tcp_hlen = th->th_off << 2; /* This is used in the transmit desc in encap */ paylen = mp->m_pkthdr.len - ehdrlen - ip_hlen - tcp_hlen; /* VLAN MACLEN IPLEN */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT); } vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= ip_hlen; TXD->vlan_macip_lens = htole32(vlan_macip_lens); /* ADV DTYPE TUCMD */ type_tucmd_mlhl |= IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); /* MSS L4LEN IDX */ mss_l4len_idx |= (mp->m_pkthdr.tso_segsz << IXGBE_ADVTXD_MSS_SHIFT); mss_l4len_idx |= (tcp_hlen << IXGBE_ADVTXD_L4LEN_SHIFT); TXD->mss_l4len_idx = htole32(mss_l4len_idx); TXD->seqnum_seed = htole32(0); if (++ctxd == txr->num_desc) ctxd = 0; txr->tx_avail--; txr->next_avail_desc = ctxd; *cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE; *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; *olinfo_status |= paylen << IXGBE_ADVTXD_PAYLEN_SHIFT; ++txr->tso_tx; return (0); } #ifdef IXGBE_FDIR /* ** This routine parses packet headers so that Flow ** Director can make a hashed filter table entry ** allowing traffic flows to be identified and kept ** on the same cpu. This would be a performance ** hit, but we only do it at IXGBE_FDIR_RATE of ** packets. */ static void ixgbe_atr(struct tx_ring *txr, struct mbuf *mp) { struct adapter *adapter = txr->adapter; struct ix_queue *que; struct ip *ip; struct tcphdr *th; struct udphdr *uh; struct ether_vlan_header *eh; union ixgbe_atr_hash_dword input = {.dword = 0}; union ixgbe_atr_hash_dword common = {.dword = 0}; int ehdrlen, ip_hlen; u16 etype; eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; etype = eh->evl_proto; } else { ehdrlen = ETHER_HDR_LEN; etype = eh->evl_encap_proto; } /* Only handling IPv4 */ if (etype != htons(ETHERTYPE_IP)) return; ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = ip->ip_hl << 2; /* check if we're UDP or TCP */ switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)((caddr_t)ip + ip_hlen); /* src and dst are inverted */ common.port.dst ^= th->th_sport; common.port.src ^= th->th_dport; input.formatted.flow_type ^= IXGBE_ATR_FLOW_TYPE_TCPV4; break; case IPPROTO_UDP: uh = (struct udphdr *)((caddr_t)ip + ip_hlen); /* src and dst are inverted */ common.port.dst ^= uh->uh_sport; common.port.src ^= uh->uh_dport; input.formatted.flow_type ^= IXGBE_ATR_FLOW_TYPE_UDPV4; break; default: return; } input.formatted.vlan_id = htobe16(mp->m_pkthdr.ether_vtag); if (mp->m_pkthdr.ether_vtag) common.flex_bytes ^= htons(ETHERTYPE_VLAN); else common.flex_bytes ^= etype; common.ip ^= ip->ip_src.s_addr ^ ip->ip_dst.s_addr; que = &adapter->queues[txr->me]; /* ** This assumes the Rx queue and Tx ** queue are bound to the same CPU */ ixgbe_fdir_add_signature_filter_82599(&adapter->hw, input, common, que->msix); } #endif /* IXGBE_FDIR */ /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. * **********************************************************************/ static void ixgbe_txeof(struct tx_ring *txr) { #ifdef DEV_NETMAP struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; #endif u32 work, processed = 0; u16 limit = txr->process_limit; struct ixgbe_tx_buf *buf; union ixgbe_adv_tx_desc *txd; mtx_assert(&txr->tx_mtx, MA_OWNED); #ifdef DEV_NETMAP if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring = &na->tx_rings[txr->me]; txd = txr->tx_base; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* * In netmap mode, all the work is done in the context * of the client thread. Interrupt handlers only wake up * clients, which may be sleeping on individual rings * or on a global resource for all rings. * To implement tx interrupt mitigation, we wake up the client * thread roughly every half ring, even if the NIC interrupts * more frequently. This is implemented as follows: * - ixgbe_txsync() sets kring->nr_kflags with the index of * the slot that should wake up the thread (nkr_num_slots * means the user thread should not be woken up); * - the driver ignores tx interrupts unless netmap_mitigate=0 * or the slot has the DD bit set. */ if (!netmap_mitigate || (kring->nr_kflags < kring->nkr_num_slots && txd[kring->nr_kflags].wb.status & IXGBE_TXD_STAT_DD)) { netmap_tx_irq(ifp, txr->me); } return; } #endif /* DEV_NETMAP */ if (txr->tx_avail == txr->num_desc) { txr->queue_status = IXGBE_QUEUE_IDLE; return; } /* Get work starting point */ work = txr->next_to_clean; buf = &txr->tx_buffers[work]; txd = &txr->tx_base[work]; work -= txr->num_desc; /* The distance to ring end */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); do { union ixgbe_adv_tx_desc *eop= buf->eop; if (eop == NULL) /* No work */ break; if ((eop->wb.status & IXGBE_TXD_STAT_DD) == 0) break; /* I/O not complete */ if (buf->m_head) { txr->bytes += buf->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; buf->map = NULL; } buf->eop = NULL; ++txr->tx_avail; /* We clean the range if multi segment */ while (txd != eop) { ++txd; ++buf; ++work; /* wrap the ring? */ if (__predict_false(!work)) { work -= txr->num_desc; buf = txr->tx_buffers; txd = txr->tx_base; } if (buf->m_head) { txr->bytes += buf->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; buf->map = NULL; } ++txr->tx_avail; buf->eop = NULL; } ++txr->packets; ++processed; txr->watchdog_time = ticks; /* Try the next packet */ ++txd; ++buf; ++work; /* reset with a wrap */ if (__predict_false(!work)) { work -= txr->num_desc; buf = txr->tx_buffers; txd = txr->tx_base; } prefetch(txd); } while (__predict_true(--limit)); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); work += txr->num_desc; txr->next_to_clean = work; /* ** Watchdog calculation, we know there's ** work outstanding or the first return ** would have been taken, so none processed ** for too long indicates a hang. */ if ((!processed) && ((ticks - txr->watchdog_time) > IXGBE_WATCHDOG)) txr->queue_status = IXGBE_QUEUE_HUNG; if (txr->tx_avail == txr->num_desc) txr->queue_status = IXGBE_QUEUE_IDLE; return; } /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings * - now keeps its own state so discards due to resource * exhaustion are unnecessary, if an mbuf cannot be obtained * it just returns, keeping its placeholder, thus it can simply * be recalled to try again. * **********************************************************************/ static void ixgbe_refresh_mbufs(struct rx_ring *rxr, int limit) { struct adapter *adapter = rxr->adapter; bus_dma_segment_t seg[1]; struct ixgbe_rx_buf *rxbuf; struct mbuf *mp; int i, j, nsegs, error; bool refreshed = FALSE; i = j = rxr->next_to_refresh; /* Control the loop with one beyond */ if (++j == rxr->num_desc) j = 0; while (j != limit) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->buf == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxr->mbuf_sz); if (mp == NULL) goto update; if (adapter->max_frame_size <= (MCLBYTES - ETHER_ALIGN)) m_adj(mp, ETHER_ALIGN); } else mp = rxbuf->buf; mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz; /* If we're dealing with an mbuf that was copied rather * than replaced, there's no need to go through busdma. */ if ((rxbuf->flags & IXGBE_RX_COPY) == 0) { /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, seg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: payload dmamap load" " failure - %d\n", error); m_free(mp); rxbuf->buf = NULL; goto update; } rxbuf->buf = mp; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); rxbuf->addr = rxr->rx_base[i].read.pkt_addr = htole64(seg[0].ds_addr); } else { rxr->rx_base[i].read.pkt_addr = rxbuf->addr; rxbuf->flags &= ~IXGBE_RX_COPY; } refreshed = TRUE; /* Next is precalculated */ i = j; rxr->next_to_refresh = i; if (++j == rxr->num_desc) j = 0; } update: if (refreshed) /* Update hardware tail index */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), rxr->next_to_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int ixgbe_allocate_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; device_t dev = adapter->dev; struct ixgbe_rx_buf *rxbuf; int i, bsize, error; bsize = sizeof(struct ixgbe_rx_buf) * rxr->num_desc; if (!(rxr->rx_buffers = (struct ixgbe_rx_buf *) malloc(bsize, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); error = ENOMEM; goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM16BYTES, /* maxsize */ 1, /* nsegments */ MJUM16BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->ptag))) { device_printf(dev, "Unable to create RX DMA tag\n"); goto fail; } for (i = 0; i < rxr->num_desc; i++, rxbuf++) { rxbuf = &rxr->rx_buffers[i]; error = bus_dmamap_create(rxr->ptag, BUS_DMA_NOWAIT, &rxbuf->pmap); if (error) { device_printf(dev, "Unable to create RX dma map\n"); goto fail; } } return (0); fail: /* Frees all, but can handle partial completion */ ixgbe_free_receive_structures(adapter); return (error); } /* ** Used to detect a descriptor that has ** been merged by Hardware RSC. */ static inline u32 ixgbe_rsc_count(union ixgbe_adv_rx_desc *rx) { return (le32toh(rx->wb.lower.lo_dword.data) & IXGBE_RXDADV_RSCCNT_MASK) >> IXGBE_RXDADV_RSCCNT_SHIFT; } /********************************************************************* * * Initialize Hardware RSC (LRO) feature on 82599 * for an RX ring, this is toggled by the LRO capability * even though it is transparent to the stack. * * NOTE: since this HW feature only works with IPV4 and * our testing has shown soft LRO to be as effective * I have decided to disable this by default. * **********************************************************************/ static void ixgbe_setup_hw_rsc(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct ixgbe_hw *hw = &adapter->hw; u32 rscctrl, rdrxctl; /* If turning LRO/RSC off we need to disable it */ if ((adapter->ifp->if_capenable & IFCAP_LRO) == 0) { rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(rxr->me)); rscctrl &= ~IXGBE_RSCCTL_RSCEN; return; } rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL); rdrxctl &= ~IXGBE_RDRXCTL_RSCFRSTSIZE; #ifdef DEV_NETMAP /* crcstrip is optional in netmap */ if (adapter->ifp->if_capenable & IFCAP_NETMAP && !ix_crcstrip) #endif /* DEV_NETMAP */ rdrxctl |= IXGBE_RDRXCTL_CRCSTRIP; rdrxctl |= IXGBE_RDRXCTL_RSCACKC; IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl); rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(rxr->me)); rscctrl |= IXGBE_RSCCTL_RSCEN; /* ** Limit the total number of descriptors that ** can be combined, so it does not exceed 64K */ if (rxr->mbuf_sz == MCLBYTES) rscctrl |= IXGBE_RSCCTL_MAXDESC_16; else if (rxr->mbuf_sz == MJUMPAGESIZE) rscctrl |= IXGBE_RSCCTL_MAXDESC_8; else if (rxr->mbuf_sz == MJUM9BYTES) rscctrl |= IXGBE_RSCCTL_MAXDESC_4; else /* Using 16K cluster */ rscctrl |= IXGBE_RSCCTL_MAXDESC_1; IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(rxr->me), rscctrl); /* Enable TCP header recognition */ IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0), (IXGBE_READ_REG(hw, IXGBE_PSRTYPE(0)) | IXGBE_PSRTYPE_TCPHDR)); /* Disable RSC for ACK packets */ IXGBE_WRITE_REG(hw, IXGBE_RSCDBU, (IXGBE_RSCDBU_RSCACKDIS | IXGBE_READ_REG(hw, IXGBE_RSCDBU))); rxr->hw_rsc = TRUE; } static void ixgbe_free_receive_ring(struct rx_ring *rxr) { struct ixgbe_rx_buf *rxbuf; int i; for (i = 0; i < rxr->num_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->buf != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->buf->m_flags |= M_PKTHDR; m_freem(rxbuf->buf); rxbuf->buf = NULL; rxbuf->flags = 0; } } } /********************************************************************* * * Initialize a receive ring and its buffers. * **********************************************************************/ static int ixgbe_setup_receive_ring(struct rx_ring *rxr) { struct adapter *adapter; struct ifnet *ifp; device_t dev; struct ixgbe_rx_buf *rxbuf; bus_dma_segment_t seg[1]; struct lro_ctrl *lro = &rxr->lro; int rsize, nsegs, error = 0; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(rxr->adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ adapter = rxr->adapter; ifp = adapter->ifp; dev = adapter->dev; /* Clear the ring contents */ IXGBE_RX_LOCK(rxr); #ifdef DEV_NETMAP /* same as in ixgbe_setup_transmit_ring() */ slot = netmap_reset(na, NR_RX, rxr->me, 0); #endif /* DEV_NETMAP */ rsize = roundup2(adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); /* Cache the size */ rxr->mbuf_sz = adapter->rx_mbuf_sz; /* Free current RX buffer structs and their mbufs */ ixgbe_free_receive_ring(rxr); /* Now replenish the mbufs */ for (int j = 0; j != rxr->num_desc; ++j) { struct mbuf *mp; rxbuf = &rxr->rx_buffers[j]; #ifdef DEV_NETMAP /* * In netmap mode, fill the map and set the buffer * address in the NIC ring, considering the offset * between the netmap and NIC rings (see comment in * ixgbe_setup_transmit_ring() ). No need to allocate * an mbuf, so end the block with a continue; */ if (slot) { int sj = netmap_idx_n2k(&na->rx_rings[rxr->me], j); uint64_t paddr; void *addr; addr = PNMB(na, slot + sj, &paddr); netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr); /* Update descriptor and the cached value */ rxr->rx_base[j].read.pkt_addr = htole64(paddr); rxbuf->addr = htole64(paddr); continue; } #endif /* DEV_NETMAP */ rxbuf->flags = 0; rxbuf->buf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (rxbuf->buf == NULL) { error = ENOBUFS; goto fail; } mp = rxbuf->buf; mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, seg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) goto fail; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); /* Update the descriptor and the cached value */ rxr->rx_base[j].read.pkt_addr = htole64(seg[0].ds_addr); rxbuf->addr = htole64(seg[0].ds_addr); } /* Setup our descriptor indices */ rxr->next_to_check = 0; rxr->next_to_refresh = 0; rxr->lro_enabled = FALSE; rxr->rx_copies = 0; rxr->rx_bytes = 0; rxr->vtag_strip = FALSE; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* ** Now set up the LRO interface: */ if (ixgbe_rsc_enable) ixgbe_setup_hw_rsc(rxr); else if (ifp->if_capenable & IFCAP_LRO) { int err = tcp_lro_init(lro); if (err) { device_printf(dev, "LRO Initialization failed!\n"); goto fail; } INIT_DEBUGOUT("RX Soft LRO Initialized\n"); rxr->lro_enabled = TRUE; lro->ifp = adapter->ifp; } IXGBE_RX_UNLOCK(rxr); return (0); fail: ixgbe_free_receive_ring(rxr); IXGBE_RX_UNLOCK(rxr); return (error); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int ixgbe_setup_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; int j; for (j = 0; j < adapter->num_queues; j++, rxr++) if (ixgbe_setup_receive_ring(rxr)) goto fail; return (0); fail: /* * Free RX buffers allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'j' failed, so its the terminus. */ for (int i = 0; i < j; ++i) { rxr = &adapter->rx_rings[i]; ixgbe_free_receive_ring(rxr); } return (ENOBUFS); } static void ixgbe_initialise_rss_mapping(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; uint32_t reta; int i, j, queue_id; uint32_t rss_key[10]; uint32_t mrqc; #ifdef RSS uint32_t rss_hash_config; #endif /* Setup RSS */ reta = 0; #ifdef RSS /* Fetch the configured RSS key */ rss_getkey((uint8_t *) &rss_key); #else /* set up random bits */ arc4rand(&rss_key, sizeof(rss_key), 0); #endif /* Set up the redirection table */ for (i = 0, j = 0; i < 128; i++, j++) { if (j == adapter->num_queues) j = 0; #ifdef RSS /* * Fetch the RSS bucket id for the given indirection entry. * Cap it at the number of configured buckets (which is * num_queues.) */ queue_id = rss_get_indirection_to_bucket(i); queue_id = queue_id % adapter->num_queues; #else queue_id = (j * 0x11); #endif /* * The low 8 bits are for hash value (n+0); * The next 8 bits are for hash value (n+1), etc. */ reta = reta >> 8; reta = reta | ( ((uint32_t) queue_id) << 24); if ((i & 3) == 3) { IXGBE_WRITE_REG(hw, IXGBE_RETA(i >> 2), reta); reta = 0; } } /* Now fill our hash function seeds */ for (int i = 0; i < 10; i++) IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), rss_key[i]); /* Perform hash on these packet types */ #ifdef RSS mrqc = IXGBE_MRQC_RSSEN; rss_hash_config = rss_gethashconfig(); if (rss_hash_config & RSS_HASHTYPE_RSS_IPV4) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4; if (rss_hash_config & RSS_HASHTYPE_RSS_TCP_IPV4) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4_TCP; if (rss_hash_config & RSS_HASHTYPE_RSS_IPV6) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6; if (rss_hash_config & RSS_HASHTYPE_RSS_TCP_IPV6) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_TCP; if (rss_hash_config & RSS_HASHTYPE_RSS_IPV6_EX) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX; if (rss_hash_config & RSS_HASHTYPE_RSS_TCP_IPV6_EX) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX_TCP; if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV4) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4_UDP; if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV4_EX) device_printf(adapter->dev, "%s: RSS_HASHTYPE_RSS_UDP_IPV4_EX defined, " "but not supported\n", __func__); if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV6) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_UDP; if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV6_EX) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX_UDP; #else /* * Disable UDP - IP fragments aren't currently being handled * and so we end up with a mix of 2-tuple and 4-tuple * traffic. */ mrqc = IXGBE_MRQC_RSSEN | IXGBE_MRQC_RSS_FIELD_IPV4 | IXGBE_MRQC_RSS_FIELD_IPV4_TCP #if 0 | IXGBE_MRQC_RSS_FIELD_IPV4_UDP #endif | IXGBE_MRQC_RSS_FIELD_IPV6_EX_TCP | IXGBE_MRQC_RSS_FIELD_IPV6_EX | IXGBE_MRQC_RSS_FIELD_IPV6 | IXGBE_MRQC_RSS_FIELD_IPV6_TCP #if 0 | IXGBE_MRQC_RSS_FIELD_IPV6_UDP | IXGBE_MRQC_RSS_FIELD_IPV6_EX_UDP #endif ; #endif /* RSS */ IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc); } /********************************************************************* * * Setup receive registers and features. * **********************************************************************/ #define IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT 2 #define BSIZEPKT_ROUNDUP ((1<rx_rings; struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 bufsz, rxctrl, fctrl, srrctl, rxcsum; u32 hlreg; /* * Make sure receives are disabled while * setting up the descriptor ring */ rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL); IXGBE_WRITE_REG(hw, IXGBE_RXCTRL, rxctrl & ~IXGBE_RXCTRL_RXEN); /* Enable broadcasts */ fctrl = IXGBE_READ_REG(hw, IXGBE_FCTRL); fctrl |= IXGBE_FCTRL_BAM; fctrl |= IXGBE_FCTRL_DPF; fctrl |= IXGBE_FCTRL_PMCF; IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl); /* Set for Jumbo Frames? */ hlreg = IXGBE_READ_REG(hw, IXGBE_HLREG0); if (ifp->if_mtu > ETHERMTU) hlreg |= IXGBE_HLREG0_JUMBOEN; else hlreg &= ~IXGBE_HLREG0_JUMBOEN; #ifdef DEV_NETMAP /* crcstrip is conditional in netmap (in RDRXCTL too ?) */ if (ifp->if_capenable & IFCAP_NETMAP && !ix_crcstrip) hlreg &= ~IXGBE_HLREG0_RXCRCSTRP; else hlreg |= IXGBE_HLREG0_RXCRCSTRP; #endif /* DEV_NETMAP */ IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg); bufsz = (adapter->rx_mbuf_sz + BSIZEPKT_ROUNDUP) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; for (int i = 0; i < adapter->num_queues; i++, rxr++) { u64 rdba = rxr->rxdma.dma_paddr; /* Setup the Base and Length of the Rx Descriptor Ring */ IXGBE_WRITE_REG(hw, IXGBE_RDBAL(i), (rdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_RDBAH(i), (rdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_RDLEN(i), adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc)); /* Set up the SRRCTL register */ srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(i)); srrctl &= ~IXGBE_SRRCTL_BSIZEHDR_MASK; srrctl &= ~IXGBE_SRRCTL_BSIZEPKT_MASK; srrctl |= bufsz; srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF; /* * Set DROP_EN iff we have no flow control and >1 queue. * Note that srrctl was cleared shortly before during reset, * so we do not need to clear the bit, but do it just in case * this code is moved elsewhere. */ if (adapter->num_queues > 1 && adapter->fc == ixgbe_fc_none) { srrctl |= IXGBE_SRRCTL_DROP_EN; } else { srrctl &= ~IXGBE_SRRCTL_DROP_EN; } IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(i), srrctl); /* Setup the HW Rx Head and Tail Descriptor Pointers */ IXGBE_WRITE_REG(hw, IXGBE_RDH(i), 0); IXGBE_WRITE_REG(hw, IXGBE_RDT(i), 0); /* Set the processing limit */ rxr->process_limit = ixgbe_rx_process_limit; } if (adapter->hw.mac.type != ixgbe_mac_82598EB) { u32 psrtype = IXGBE_PSRTYPE_TCPHDR | IXGBE_PSRTYPE_UDPHDR | IXGBE_PSRTYPE_IPV4HDR | IXGBE_PSRTYPE_IPV6HDR; IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0), psrtype); } rxcsum = IXGBE_READ_REG(hw, IXGBE_RXCSUM); ixgbe_initialise_rss_mapping(adapter); if (adapter->num_queues > 1) { /* RSS and RX IPP Checksum are mutually exclusive */ rxcsum |= IXGBE_RXCSUM_PCSD; } if (ifp->if_capenable & IFCAP_RXCSUM) rxcsum |= IXGBE_RXCSUM_PCSD; if (!(rxcsum & IXGBE_RXCSUM_PCSD)) rxcsum |= IXGBE_RXCSUM_IPPCSE; IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum); return; } /********************************************************************* * * Free all receive rings. * **********************************************************************/ static void ixgbe_free_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; INIT_DEBUGOUT("ixgbe_free_receive_structures: begin"); for (int i = 0; i < adapter->num_queues; i++, rxr++) { struct lro_ctrl *lro = &rxr->lro; ixgbe_free_receive_buffers(rxr); /* Free LRO memory */ tcp_lro_free(lro); /* Free the ring memory as well */ ixgbe_dma_free(adapter, &rxr->rxdma); } free(adapter->rx_rings, M_DEVBUF); } /********************************************************************* * * Free receive ring data structures * **********************************************************************/ static void ixgbe_free_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct ixgbe_rx_buf *rxbuf; INIT_DEBUGOUT("ixgbe_free_receive_buffers: begin"); /* Cleanup any existing buffers */ if (rxr->rx_buffers != NULL) { for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->buf != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->buf->m_flags |= M_PKTHDR; m_freem(rxbuf->buf); } rxbuf->buf = NULL; if (rxbuf->pmap != NULL) { bus_dmamap_destroy(rxr->ptag, rxbuf->pmap); rxbuf->pmap = NULL; } } if (rxr->rx_buffers != NULL) { free(rxr->rx_buffers, M_DEVBUF); rxr->rx_buffers = NULL; } } if (rxr->ptag != NULL) { bus_dma_tag_destroy(rxr->ptag); rxr->ptag = NULL; } return; } static __inline void ixgbe_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u32 ptype) { /* * ATM LRO is only for IP/TCP packets and TCP checksum of the packet * should be computed by hardware. Also it should not have VLAN tag in * ethernet header. In case of IPv6 we do not yet support ext. hdrs. */ if (rxr->lro_enabled && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (ptype & IXGBE_RXDADV_PKTTYPE_ETQF) == 0 && ((ptype & (IXGBE_RXDADV_PKTTYPE_IPV4 | IXGBE_RXDADV_PKTTYPE_TCP)) == (IXGBE_RXDADV_PKTTYPE_IPV4 | IXGBE_RXDADV_PKTTYPE_TCP) || (ptype & (IXGBE_RXDADV_PKTTYPE_IPV6 | IXGBE_RXDADV_PKTTYPE_TCP)) == (IXGBE_RXDADV_PKTTYPE_IPV6 | IXGBE_RXDADV_PKTTYPE_TCP)) && (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) == (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) { /* * Send to the stack if: ** - LRO not enabled, or ** - no LRO resources, or ** - lro enqueue fails */ if (rxr->lro.lro_cnt != 0) if (tcp_lro_rx(&rxr->lro, m, 0) == 0) return; } IXGBE_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, m); IXGBE_RX_LOCK(rxr); } static __inline void ixgbe_rx_discard(struct rx_ring *rxr, int i) { struct ixgbe_rx_buf *rbuf; rbuf = &rxr->rx_buffers[i]; /* ** With advanced descriptors the writeback ** clobbers the buffer addrs, so its easier ** to just free the existing mbufs and take ** the normal refresh path to get new buffers ** and mapping. */ if (rbuf->fmp != NULL) {/* Partial chain ? */ rbuf->fmp->m_flags |= M_PKTHDR; m_freem(rbuf->fmp); rbuf->fmp = NULL; rbuf->buf = NULL; /* rbuf->buf is part of fmp's chain */ } else if (rbuf->buf) { m_free(rbuf->buf); rbuf->buf = NULL; } rbuf->flags = 0; return; } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * Return TRUE for more work, FALSE for all clean. *********************************************************************/ static bool ixgbe_rxeof(struct ix_queue *que) { struct adapter *adapter = que->adapter; struct rx_ring *rxr = que->rxr; struct ifnet *ifp = adapter->ifp; struct lro_ctrl *lro = &rxr->lro; struct lro_entry *queued; int i, nextp, processed = 0; u32 staterr = 0; u16 count = rxr->process_limit; union ixgbe_adv_rx_desc *cur; struct ixgbe_rx_buf *rbuf, *nbuf; u16 pkt_info; IXGBE_RX_LOCK(rxr); #ifdef DEV_NETMAP /* Same as the txeof routine: wakeup clients on intr. */ if (netmap_rx_irq(ifp, rxr->me, &processed)) { IXGBE_RX_UNLOCK(rxr); return (FALSE); } #endif /* DEV_NETMAP */ for (i = rxr->next_to_check; count != 0;) { struct mbuf *sendmp, *mp; u32 rsc, ptype; u16 len; u16 vtag = 0; bool eop; /* Sync the ring. */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); cur = &rxr->rx_base[i]; staterr = le32toh(cur->wb.upper.status_error); pkt_info = le16toh(cur->wb.lower.lo_dword.hs_rss.pkt_info); if ((staterr & IXGBE_RXD_STAT_DD) == 0) break; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; count--; sendmp = NULL; nbuf = NULL; rsc = 0; cur->wb.upper.status_error = 0; rbuf = &rxr->rx_buffers[i]; mp = rbuf->buf; len = le16toh(cur->wb.upper.length); ptype = le32toh(cur->wb.lower.lo_dword.data) & IXGBE_RXDADV_PKTTYPE_MASK; eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0); /* Make sure bad packets are discarded */ if (eop && (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) != 0) { rxr->rx_discarded++; ixgbe_rx_discard(rxr, i); goto next_desc; } /* ** On 82599 which supports a hardware ** LRO (called HW RSC), packets need ** not be fragmented across sequential ** descriptors, rather the next descriptor ** is indicated in bits of the descriptor. ** This also means that we might proceses ** more than one packet at a time, something ** that has never been true before, it ** required eliminating global chain pointers ** in favor of what we are doing here. -jfv */ if (!eop) { /* ** Figure out the next descriptor ** of this frame. */ if (rxr->hw_rsc == TRUE) { rsc = ixgbe_rsc_count(cur); rxr->rsc_num += (rsc - 1); } if (rsc) { /* Get hardware index */ nextp = ((staterr & IXGBE_RXDADV_NEXTP_MASK) >> IXGBE_RXDADV_NEXTP_SHIFT); } else { /* Just sequential */ nextp = i + 1; if (nextp == adapter->num_rx_desc) nextp = 0; } nbuf = &rxr->rx_buffers[nextp]; prefetch(nbuf); } /* ** Rather than using the fmp/lmp global pointers ** we now keep the head of a packet chain in the ** buffer struct and pass this along from one ** descriptor to the next, until we get EOP. */ mp->m_len = len; /* ** See if there is a stored head ** that determines what we are */ sendmp = rbuf->fmp; if (sendmp != NULL) { /* secondary frag */ rbuf->buf = rbuf->fmp = NULL; mp->m_flags &= ~M_PKTHDR; sendmp->m_pkthdr.len += mp->m_len; } else { /* * Optimize. This might be a small packet, * maybe just a TCP ACK. Do a fast copy that * is cache aligned into a new mbuf, and * leave the old mbuf+cluster for re-use. */ if (eop && len <= IXGBE_RX_COPY_LEN) { sendmp = m_gethdr(M_NOWAIT, MT_DATA); if (sendmp != NULL) { sendmp->m_data += IXGBE_RX_COPY_ALIGN; ixgbe_bcopy(mp->m_data, sendmp->m_data, len); sendmp->m_len = len; rxr->rx_copies++; rbuf->flags |= IXGBE_RX_COPY; } } if (sendmp == NULL) { rbuf->buf = rbuf->fmp = NULL; sendmp = mp; } /* first desc of a non-ps chain */ sendmp->m_flags |= M_PKTHDR; sendmp->m_pkthdr.len = mp->m_len; } ++processed; /* Pass the head pointer on */ if (eop == 0) { nbuf->fmp = sendmp; sendmp = NULL; mp->m_next = nbuf->buf; } else { /* Sending this frame */ sendmp->m_pkthdr.rcvif = ifp; rxr->rx_packets++; /* capture data for AIM */ rxr->bytes += sendmp->m_pkthdr.len; rxr->rx_bytes += sendmp->m_pkthdr.len; /* Process vlan info */ if ((rxr->vtag_strip) && (staterr & IXGBE_RXD_STAT_VP)) vtag = le16toh(cur->wb.upper.vlan); if (vtag) { sendmp->m_pkthdr.ether_vtag = vtag; sendmp->m_flags |= M_VLANTAG; } if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) ixgbe_rx_checksum(staterr, sendmp, ptype); #if __FreeBSD_version >= 800000 #ifdef RSS sendmp->m_pkthdr.flowid = le32toh(cur->wb.lower.hi_dword.rss); - sendmp->m_flags |= M_FLOWID; switch (pkt_info & IXGBE_RXDADV_RSSTYPE_MASK) { case IXGBE_RXDADV_RSSTYPE_IPV4_TCP: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_TCP_IPV4); break; case IXGBE_RXDADV_RSSTYPE_IPV4: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_IPV4); break; case IXGBE_RXDADV_RSSTYPE_IPV6_TCP: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_TCP_IPV6); break; case IXGBE_RXDADV_RSSTYPE_IPV6_EX: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_IPV6_EX); break; case IXGBE_RXDADV_RSSTYPE_IPV6: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_IPV6); break; case IXGBE_RXDADV_RSSTYPE_IPV6_TCP_EX: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_TCP_IPV6_EX); break; case IXGBE_RXDADV_RSSTYPE_IPV4_UDP: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_UDP_IPV4); break; case IXGBE_RXDADV_RSSTYPE_IPV6_UDP: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_UDP_IPV6); break; case IXGBE_RXDADV_RSSTYPE_IPV6_UDP_EX: M_HASHTYPE_SET(sendmp, M_HASHTYPE_RSS_UDP_IPV6_EX); break; default: /* XXX fallthrough */ - M_HASHTYPE_SET(sendmp, M_HASHTYPE_NONE); + M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE); + break; } #else /* RSS */ sendmp->m_pkthdr.flowid = que->msix; - sendmp->m_flags |= M_FLOWID; + M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE); #endif /* RSS */ #endif /* FreeBSD_version */ } next_desc: bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == rxr->num_desc) i = 0; /* Now send to the stack or do LRO */ if (sendmp != NULL) { rxr->next_to_check = i; ixgbe_rx_input(rxr, ifp, sendmp, ptype); i = rxr->next_to_check; } /* Every 8 descriptors we go to refresh mbufs */ if (processed == 8) { ixgbe_refresh_mbufs(rxr, i); processed = 0; } } /* Refresh any remaining buf structs */ if (ixgbe_rx_unrefreshed(rxr)) ixgbe_refresh_mbufs(rxr, i); rxr->next_to_check = i; /* * Flush any outstanding LRO work */ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } IXGBE_RX_UNLOCK(rxr); /* ** Still have cleaning to do? */ if ((staterr & IXGBE_RXD_STAT_DD) != 0) return (TRUE); else return (FALSE); } /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of checksum so that stack * doesn't spend time verifying the checksum. * *********************************************************************/ static void ixgbe_rx_checksum(u32 staterr, struct mbuf * mp, u32 ptype) { u16 status = (u16) staterr; u8 errors = (u8) (staterr >> 24); bool sctp = FALSE; if ((ptype & IXGBE_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & IXGBE_RXDADV_PKTTYPE_SCTP) != 0) sctp = TRUE; if (status & IXGBE_RXD_STAT_IPCS) { if (!(errors & IXGBE_RXD_ERR_IPE)) { /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; } else mp->m_pkthdr.csum_flags = 0; } if (status & IXGBE_RXD_STAT_L4CS) { u64 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); #if __FreeBSD_version >= 800000 if (sctp) type = CSUM_SCTP_VALID; #endif if (!(errors & IXGBE_RXD_ERR_TCPE)) { mp->m_pkthdr.csum_flags |= type; if (!sctp) mp->m_pkthdr.csum_data = htons(0xffff); } } return; } /* ** This routine is run via an vlan config EVENT, ** it enables us to use the HW Filter table since ** we can get the vlan id. This just creates the ** entry in the soft version of the VFTA, init will ** repopulate the real table. */ static void ixgbe_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXGBE_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; ixgbe_setup_vlan_hw_support(adapter); IXGBE_CORE_UNLOCK(adapter); } /* ** This routine is run via an vlan ** unconfig EVENT, remove our entry ** in the soft vfta. */ static void ixgbe_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXGBE_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Re-init to load the changes */ ixgbe_setup_vlan_hw_support(adapter); IXGBE_CORE_UNLOCK(adapter); } static void ixgbe_setup_vlan_hw_support(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ixgbe_hw *hw = &adapter->hw; struct rx_ring *rxr; u32 ctrl; /* ** We get here thru init_locked, meaning ** a soft reset, this has already cleared ** the VFTA and other state, so if there ** have been no vlan's registered do nothing. */ if (adapter->num_vlans == 0) return; /* Setup the queues for vlans */ for (int i = 0; i < adapter->num_queues; i++) { rxr = &adapter->rx_rings[i]; /* On 82599 the VLAN enable is per/queue in RXDCTL */ if (hw->mac.type != ixgbe_mac_82598EB) { ctrl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(i)); ctrl |= IXGBE_RXDCTL_VME; IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(i), ctrl); } rxr->vtag_strip = TRUE; } if ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0) return; /* ** A soft reset zero's out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < IXGBE_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) IXGBE_WRITE_REG(hw, IXGBE_VFTA(i), adapter->shadow_vfta[i]); ctrl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); /* Enable the Filter Table if enabled */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { ctrl &= ~IXGBE_VLNCTRL_CFIEN; ctrl |= IXGBE_VLNCTRL_VFE; } if (hw->mac.type == ixgbe_mac_82598EB) ctrl |= IXGBE_VLNCTRL_VME; IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, ctrl); } static void ixgbe_enable_intr(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; struct ix_queue *que = adapter->queues; u32 mask, fwsm; mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE); /* Enable Fan Failure detection */ if (hw->device_id == IXGBE_DEV_ID_82598AT) mask |= IXGBE_EIMS_GPI_SDP1; switch (adapter->hw.mac.type) { case ixgbe_mac_82599EB: mask |= IXGBE_EIMS_ECC; mask |= IXGBE_EIMS_GPI_SDP0; mask |= IXGBE_EIMS_GPI_SDP1; mask |= IXGBE_EIMS_GPI_SDP2; #ifdef IXGBE_FDIR mask |= IXGBE_EIMS_FLOW_DIR; #endif break; case ixgbe_mac_X540: mask |= IXGBE_EIMS_ECC; /* Detect if Thermal Sensor is enabled */ fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM); if (fwsm & IXGBE_FWSM_TS_ENABLED) mask |= IXGBE_EIMS_TS; #ifdef IXGBE_FDIR mask |= IXGBE_EIMS_FLOW_DIR; #endif /* falls through */ default: break; } IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask); /* With RSS we use auto clear */ if (adapter->msix_mem) { mask = IXGBE_EIMS_ENABLE_MASK; /* Don't autoclear Link */ mask &= ~IXGBE_EIMS_OTHER; mask &= ~IXGBE_EIMS_LSC; IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask); } /* ** Now enable all queues, this is done separately to ** allow for handling the extended (beyond 32) MSIX ** vectors that can be used by 82599 */ for (int i = 0; i < adapter->num_queues; i++, que++) ixgbe_enable_queue(adapter, que->msix); IXGBE_WRITE_FLUSH(hw); return; } static void ixgbe_disable_intr(struct adapter *adapter) { if (adapter->msix_mem) IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIAC, 0); if (adapter->hw.mac.type == ixgbe_mac_82598EB) { IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, ~0); } else { IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, 0xFFFF0000); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC_EX(0), ~0); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC_EX(1), ~0); } IXGBE_WRITE_FLUSH(&adapter->hw); return; } u16 ixgbe_read_pci_cfg(struct ixgbe_hw *hw, u32 reg) { u16 value; value = pci_read_config(((struct ixgbe_osdep *)hw->back)->dev, reg, 2); return (value); } void ixgbe_write_pci_cfg(struct ixgbe_hw *hw, u32 reg, u16 value) { pci_write_config(((struct ixgbe_osdep *)hw->back)->dev, reg, value, 2); return; } /* ** Get the width and transaction speed of ** the slot this adapter is plugged into. */ static void ixgbe_get_slot_info(struct ixgbe_hw *hw) { device_t dev = ((struct ixgbe_osdep *)hw->back)->dev; struct ixgbe_mac_info *mac = &hw->mac; u16 link; u32 offset; /* For most devices simply call the shared code routine */ if (hw->device_id != IXGBE_DEV_ID_82599_SFP_SF_QP) { ixgbe_get_bus_info(hw); goto display; } /* ** For the Quad port adapter we need to parse back ** up the PCI tree to find the speed of the expansion ** slot into which this adapter is plugged. A bit more work. */ dev = device_get_parent(device_get_parent(dev)); #ifdef IXGBE_DEBUG device_printf(dev, "parent pcib = %x,%x,%x\n", pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev)); #endif dev = device_get_parent(device_get_parent(dev)); #ifdef IXGBE_DEBUG device_printf(dev, "slot pcib = %x,%x,%x\n", pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev)); #endif /* Now get the PCI Express Capabilities offset */ pci_find_cap(dev, PCIY_EXPRESS, &offset); /* ...and read the Link Status Register */ link = pci_read_config(dev, offset + PCIER_LINK_STA, 2); switch (link & IXGBE_PCI_LINK_WIDTH) { case IXGBE_PCI_LINK_WIDTH_1: hw->bus.width = ixgbe_bus_width_pcie_x1; break; case IXGBE_PCI_LINK_WIDTH_2: hw->bus.width = ixgbe_bus_width_pcie_x2; break; case IXGBE_PCI_LINK_WIDTH_4: hw->bus.width = ixgbe_bus_width_pcie_x4; break; case IXGBE_PCI_LINK_WIDTH_8: hw->bus.width = ixgbe_bus_width_pcie_x8; break; default: hw->bus.width = ixgbe_bus_width_unknown; break; } switch (link & IXGBE_PCI_LINK_SPEED) { case IXGBE_PCI_LINK_SPEED_2500: hw->bus.speed = ixgbe_bus_speed_2500; break; case IXGBE_PCI_LINK_SPEED_5000: hw->bus.speed = ixgbe_bus_speed_5000; break; case IXGBE_PCI_LINK_SPEED_8000: hw->bus.speed = ixgbe_bus_speed_8000; break; default: hw->bus.speed = ixgbe_bus_speed_unknown; break; } mac->ops.set_lan_id(hw); display: device_printf(dev,"PCI Express Bus: Speed %s %s\n", ((hw->bus.speed == ixgbe_bus_speed_8000) ? "8.0GT/s": (hw->bus.speed == ixgbe_bus_speed_5000) ? "5.0GT/s": (hw->bus.speed == ixgbe_bus_speed_2500) ? "2.5GT/s":"Unknown"), (hw->bus.width == ixgbe_bus_width_pcie_x8) ? "Width x8" : (hw->bus.width == ixgbe_bus_width_pcie_x4) ? "Width x4" : (hw->bus.width == ixgbe_bus_width_pcie_x1) ? "Width x1" : ("Unknown")); if ((hw->device_id != IXGBE_DEV_ID_82599_SFP_SF_QP) && ((hw->bus.width <= ixgbe_bus_width_pcie_x4) && (hw->bus.speed == ixgbe_bus_speed_2500))) { device_printf(dev, "PCI-Express bandwidth available" " for this card\n is not sufficient for" " optimal performance.\n"); device_printf(dev, "For optimal performance a x8 " "PCIE, or x4 PCIE Gen2 slot is required.\n"); } if ((hw->device_id == IXGBE_DEV_ID_82599_SFP_SF_QP) && ((hw->bus.width <= ixgbe_bus_width_pcie_x8) && (hw->bus.speed < ixgbe_bus_speed_8000))) { device_printf(dev, "PCI-Express bandwidth available" " for this card\n is not sufficient for" " optimal performance.\n"); device_printf(dev, "For optimal performance a x8 " "PCIE Gen3 slot is required.\n"); } return; } /* ** Setup the correct IVAR register for a particular MSIX interrupt ** (yes this is all very magic and confusing :) ** - entry is the register array entry ** - vector is the MSIX vector for this queue ** - type is RX/TX/MISC */ static void ixgbe_set_ivar(struct adapter *adapter, u8 entry, u8 vector, s8 type) { struct ixgbe_hw *hw = &adapter->hw; u32 ivar, index; vector |= IXGBE_IVAR_ALLOC_VAL; switch (hw->mac.type) { case ixgbe_mac_82598EB: if (type == -1) entry = IXGBE_IVAR_OTHER_CAUSES_INDEX; else entry += (type * 64); index = (entry >> 2) & 0x1F; ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); ivar &= ~(0xFF << (8 * (entry & 0x3))); ivar |= (vector << (8 * (entry & 0x3))); IXGBE_WRITE_REG(&adapter->hw, IXGBE_IVAR(index), ivar); break; case ixgbe_mac_82599EB: case ixgbe_mac_X540: if (type == -1) { /* MISC IVAR */ index = (entry & 1) * 8; ivar = IXGBE_READ_REG(hw, IXGBE_IVAR_MISC); ivar &= ~(0xFF << index); ivar |= (vector << index); IXGBE_WRITE_REG(hw, IXGBE_IVAR_MISC, ivar); } else { /* RX/TX IVARS */ index = (16 * (entry & 1)) + (8 * type); ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(entry >> 1)); ivar &= ~(0xFF << index); ivar |= (vector << index); IXGBE_WRITE_REG(hw, IXGBE_IVAR(entry >> 1), ivar); } default: break; } } static void ixgbe_configure_ivars(struct adapter *adapter) { struct ix_queue *que = adapter->queues; u32 newitr; if (ixgbe_max_interrupt_rate > 0) newitr = (4000000 / ixgbe_max_interrupt_rate) & 0x0FF8; else newitr = 0; for (int i = 0; i < adapter->num_queues; i++, que++) { /* First the RX queue entry */ ixgbe_set_ivar(adapter, i, que->msix, 0); /* ... and the TX */ ixgbe_set_ivar(adapter, i, que->msix, 1); /* Set an Initial EITR value */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_EITR(que->msix), newitr); } /* For the Link interrupt */ ixgbe_set_ivar(adapter, 1, adapter->linkvec, -1); } /* ** ixgbe_sfp_probe - called in the local timer to ** determine if a port had optics inserted. */ static bool ixgbe_sfp_probe(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; device_t dev = adapter->dev; bool result = FALSE; if ((hw->phy.type == ixgbe_phy_nl) && (hw->phy.sfp_type == ixgbe_sfp_type_not_present)) { s32 ret = hw->phy.ops.identify_sfp(hw); if (ret) goto out; ret = hw->phy.ops.reset(hw); if (ret == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev,"Unsupported SFP+ module detected!"); printf(" Reload driver with supported module.\n"); adapter->sfp_probe = FALSE; goto out; } else device_printf(dev,"SFP+ module detected!\n"); /* We now have supported optics */ adapter->sfp_probe = FALSE; /* Set the optics type so system reports correctly */ ixgbe_setup_optics(adapter); result = TRUE; } out: return (result); } /* ** Tasklet handler for MSIX Link interrupts ** - do outside interrupt since it might sleep */ static void ixgbe_handle_link(void *context, int pending) { struct adapter *adapter = context; ixgbe_check_link(&adapter->hw, &adapter->link_speed, &adapter->link_up, 0); ixgbe_update_link_status(adapter); } /* ** Tasklet for handling SFP module interrupts */ static void ixgbe_handle_mod(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; device_t dev = adapter->dev; u32 err; err = hw->phy.ops.identify_sfp(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module type was detected.\n"); return; } err = hw->mac.ops.setup_sfp(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Setup failure - unsupported SFP+ module type.\n"); return; } taskqueue_enqueue(adapter->tq, &adapter->msf_task); return; } /* ** Tasklet for handling MSF (multispeed fiber) interrupts */ static void ixgbe_handle_msf(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; u32 autoneg; bool negotiate; autoneg = hw->phy.autoneg_advertised; if ((!autoneg) && (hw->mac.ops.get_link_capabilities)) hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiate); if (hw->mac.ops.setup_link) hw->mac.ops.setup_link(hw, autoneg, TRUE); return; } #ifdef IXGBE_FDIR /* ** Tasklet for reinitializing the Flow Director filter table */ static void ixgbe_reinit_fdir(void *context, int pending) { struct adapter *adapter = context; struct ifnet *ifp = adapter->ifp; if (adapter->fdir_reinit != 1) /* Shouldn't happen */ return; ixgbe_reinit_fdir_tables_82599(&adapter->hw); adapter->fdir_reinit = 0; /* re-enable flow director interrupts */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMS, IXGBE_EIMS_FLOW_DIR); /* Restart the interface */ ifp->if_drv_flags |= IFF_DRV_RUNNING; return; } #endif /********************************************************************** * * Update the board statistics counters. * **********************************************************************/ static void ixgbe_update_stats_counters(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 missed_rx = 0, bprc, lxon, lxoff, total; adapter->stats.crcerrs += IXGBE_READ_REG(hw, IXGBE_CRCERRS); adapter->stats.illerrc += IXGBE_READ_REG(hw, IXGBE_ILLERRC); adapter->stats.errbc += IXGBE_READ_REG(hw, IXGBE_ERRBC); adapter->stats.mspdc += IXGBE_READ_REG(hw, IXGBE_MSPDC); /* ** Note: these are for the 8 possible traffic classes, ** which in current implementation is unused, ** therefore only 0 should read real data. */ for (int i = 0; i < 8; i++) { u32 mp; mp = IXGBE_READ_REG(hw, IXGBE_MPC(i)); /* missed_rx tallies misses for the gprc workaround */ missed_rx += mp; /* global total per queue */ adapter->stats.mpc[i] += mp; if (hw->mac.type == ixgbe_mac_82598EB) { adapter->stats.rnbc[i] += IXGBE_READ_REG(hw, IXGBE_RNBC(i)); adapter->stats.qbtc[i] += IXGBE_READ_REG(hw, IXGBE_QBTC(i)); adapter->stats.qbrc[i] += IXGBE_READ_REG(hw, IXGBE_QBRC(i)); adapter->stats.pxonrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONRXC(i)); } else adapter->stats.pxonrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONRXCNT(i)); adapter->stats.pxontxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONTXC(i)); adapter->stats.pxofftxc[i] += IXGBE_READ_REG(hw, IXGBE_PXOFFTXC(i)); adapter->stats.pxoffrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXOFFRXC(i)); adapter->stats.pxon2offc[i] += IXGBE_READ_REG(hw, IXGBE_PXON2OFFCNT(i)); } for (int i = 0; i < 16; i++) { adapter->stats.qprc[i] += IXGBE_READ_REG(hw, IXGBE_QPRC(i)); adapter->stats.qptc[i] += IXGBE_READ_REG(hw, IXGBE_QPTC(i)); adapter->stats.qprdc[i] += IXGBE_READ_REG(hw, IXGBE_QPRDC(i)); } adapter->stats.mlfc += IXGBE_READ_REG(hw, IXGBE_MLFC); adapter->stats.mrfc += IXGBE_READ_REG(hw, IXGBE_MRFC); adapter->stats.rlec += IXGBE_READ_REG(hw, IXGBE_RLEC); /* Hardware workaround, gprc counts missed packets */ adapter->stats.gprc += IXGBE_READ_REG(hw, IXGBE_GPRC); adapter->stats.gprc -= missed_rx; if (hw->mac.type != ixgbe_mac_82598EB) { adapter->stats.gorc += IXGBE_READ_REG(hw, IXGBE_GORCL) + ((u64)IXGBE_READ_REG(hw, IXGBE_GORCH) << 32); adapter->stats.gotc += IXGBE_READ_REG(hw, IXGBE_GOTCL) + ((u64)IXGBE_READ_REG(hw, IXGBE_GOTCH) << 32); adapter->stats.tor += IXGBE_READ_REG(hw, IXGBE_TORL) + ((u64)IXGBE_READ_REG(hw, IXGBE_TORH) << 32); adapter->stats.lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXCNT); adapter->stats.lxoffrxc += IXGBE_READ_REG(hw, IXGBE_LXOFFRXCNT); } else { adapter->stats.lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXC); adapter->stats.lxoffrxc += IXGBE_READ_REG(hw, IXGBE_LXOFFRXC); /* 82598 only has a counter in the high register */ adapter->stats.gorc += IXGBE_READ_REG(hw, IXGBE_GORCH); adapter->stats.gotc += IXGBE_READ_REG(hw, IXGBE_GOTCH); adapter->stats.tor += IXGBE_READ_REG(hw, IXGBE_TORH); } /* * Workaround: mprc hardware is incorrectly counting * broadcasts, so for now we subtract those. */ bprc = IXGBE_READ_REG(hw, IXGBE_BPRC); adapter->stats.bprc += bprc; adapter->stats.mprc += IXGBE_READ_REG(hw, IXGBE_MPRC); if (hw->mac.type == ixgbe_mac_82598EB) adapter->stats.mprc -= bprc; adapter->stats.prc64 += IXGBE_READ_REG(hw, IXGBE_PRC64); adapter->stats.prc127 += IXGBE_READ_REG(hw, IXGBE_PRC127); adapter->stats.prc255 += IXGBE_READ_REG(hw, IXGBE_PRC255); adapter->stats.prc511 += IXGBE_READ_REG(hw, IXGBE_PRC511); adapter->stats.prc1023 += IXGBE_READ_REG(hw, IXGBE_PRC1023); adapter->stats.prc1522 += IXGBE_READ_REG(hw, IXGBE_PRC1522); lxon = IXGBE_READ_REG(hw, IXGBE_LXONTXC); adapter->stats.lxontxc += lxon; lxoff = IXGBE_READ_REG(hw, IXGBE_LXOFFTXC); adapter->stats.lxofftxc += lxoff; total = lxon + lxoff; adapter->stats.gptc += IXGBE_READ_REG(hw, IXGBE_GPTC); adapter->stats.mptc += IXGBE_READ_REG(hw, IXGBE_MPTC); adapter->stats.ptc64 += IXGBE_READ_REG(hw, IXGBE_PTC64); adapter->stats.gptc -= total; adapter->stats.mptc -= total; adapter->stats.ptc64 -= total; adapter->stats.gotc -= total * ETHER_MIN_LEN; adapter->stats.ruc += IXGBE_READ_REG(hw, IXGBE_RUC); adapter->stats.rfc += IXGBE_READ_REG(hw, IXGBE_RFC); adapter->stats.roc += IXGBE_READ_REG(hw, IXGBE_ROC); adapter->stats.rjc += IXGBE_READ_REG(hw, IXGBE_RJC); adapter->stats.mngprc += IXGBE_READ_REG(hw, IXGBE_MNGPRC); adapter->stats.mngpdc += IXGBE_READ_REG(hw, IXGBE_MNGPDC); adapter->stats.mngptc += IXGBE_READ_REG(hw, IXGBE_MNGPTC); adapter->stats.tpr += IXGBE_READ_REG(hw, IXGBE_TPR); adapter->stats.tpt += IXGBE_READ_REG(hw, IXGBE_TPT); adapter->stats.ptc127 += IXGBE_READ_REG(hw, IXGBE_PTC127); adapter->stats.ptc255 += IXGBE_READ_REG(hw, IXGBE_PTC255); adapter->stats.ptc511 += IXGBE_READ_REG(hw, IXGBE_PTC511); adapter->stats.ptc1023 += IXGBE_READ_REG(hw, IXGBE_PTC1023); adapter->stats.ptc1522 += IXGBE_READ_REG(hw, IXGBE_PTC1522); adapter->stats.bptc += IXGBE_READ_REG(hw, IXGBE_BPTC); adapter->stats.xec += IXGBE_READ_REG(hw, IXGBE_XEC); adapter->stats.fccrc += IXGBE_READ_REG(hw, IXGBE_FCCRC); adapter->stats.fclast += IXGBE_READ_REG(hw, IXGBE_FCLAST); /* Only read FCOE on 82599 */ if (hw->mac.type != ixgbe_mac_82598EB) { adapter->stats.fcoerpdc += IXGBE_READ_REG(hw, IXGBE_FCOERPDC); adapter->stats.fcoeprc += IXGBE_READ_REG(hw, IXGBE_FCOEPRC); adapter->stats.fcoeptc += IXGBE_READ_REG(hw, IXGBE_FCOEPTC); adapter->stats.fcoedwrc += IXGBE_READ_REG(hw, IXGBE_FCOEDWRC); adapter->stats.fcoedwtc += IXGBE_READ_REG(hw, IXGBE_FCOEDWTC); } } static uint64_t ixgbe_get_counter(struct ifnet *ifp, ift_counter cnt) { struct adapter *adapter; uint64_t rv; adapter = if_getsoftc(ifp); switch (cnt) { case IFCOUNTER_IPACKETS: return (adapter->stats.gprc); case IFCOUNTER_OPACKETS: return (adapter->stats.gptc); case IFCOUNTER_IBYTES: return (adapter->stats.gorc); case IFCOUNTER_OBYTES: return (adapter->stats.gotc); case IFCOUNTER_IMCASTS: return (adapter->stats.mprc); case IFCOUNTER_OMCASTS: return (adapter->stats.mptc); case IFCOUNTER_COLLISIONS: return (0); case IFCOUNTER_IQDROPS: rv = 0; for (int i = 0; i < 8; i++) rv += adapter->stats.mpc[i]; return (rv); case IFCOUNTER_IERRORS: return (adapter->stats.crcerrs + adapter->stats.rlec); default: return (if_get_counter_default(ifp, cnt)); } } /** ixgbe_sysctl_tdh_handler - Handler function * Retrieves the TDH value from the hardware */ static int ixgbe_sysctl_tdh_handler(SYSCTL_HANDLER_ARGS) { int error; struct tx_ring *txr = ((struct tx_ring *)oidp->oid_arg1); if (!txr) return 0; unsigned val = IXGBE_READ_REG(&txr->adapter->hw, IXGBE_TDH(txr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_tdt_handler - Handler function * Retrieves the TDT value from the hardware */ static int ixgbe_sysctl_tdt_handler(SYSCTL_HANDLER_ARGS) { int error; struct tx_ring *txr = ((struct tx_ring *)oidp->oid_arg1); if (!txr) return 0; unsigned val = IXGBE_READ_REG(&txr->adapter->hw, IXGBE_TDT(txr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_rdh_handler - Handler function * Retrieves the RDH value from the hardware */ static int ixgbe_sysctl_rdh_handler(SYSCTL_HANDLER_ARGS) { int error; struct rx_ring *rxr = ((struct rx_ring *)oidp->oid_arg1); if (!rxr) return 0; unsigned val = IXGBE_READ_REG(&rxr->adapter->hw, IXGBE_RDH(rxr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_rdt_handler - Handler function * Retrieves the RDT value from the hardware */ static int ixgbe_sysctl_rdt_handler(SYSCTL_HANDLER_ARGS) { int error; struct rx_ring *rxr = ((struct rx_ring *)oidp->oid_arg1); if (!rxr) return 0; unsigned val = IXGBE_READ_REG(&rxr->adapter->hw, IXGBE_RDT(rxr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } static int ixgbe_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) { int error; struct ix_queue *que = ((struct ix_queue *)oidp->oid_arg1); unsigned int reg, usec, rate; reg = IXGBE_READ_REG(&que->adapter->hw, IXGBE_EITR(que->msix)); usec = ((reg & 0x0FF8) >> 3); if (usec > 0) rate = 500000 / usec; else rate = 0; error = sysctl_handle_int(oidp, &rate, 0, req); if (error || !req->newptr) return error; reg &= ~0xfff; /* default, no limitation */ ixgbe_max_interrupt_rate = 0; if (rate > 0 && rate < 500000) { if (rate < 1000) rate = 1000; ixgbe_max_interrupt_rate = rate; reg |= ((4000000/rate) & 0xff8 ); } IXGBE_WRITE_REG(&que->adapter->hw, IXGBE_EITR(que->msix), reg); return 0; } /* * Add sysctl variables, one per statistic, to the system. */ static void ixgbe_add_hw_stats(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct ixgbe_hw_stats *stats = &adapter->stats; struct sysctl_oid *stat_node, *queue_node; struct sysctl_oid_list *stat_list, *queue_list; #define QUEUE_NAME_LEN 32 char namebuf[QUEUE_NAME_LEN]; /* Driver Statistics */ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_defrag_failed", CTLFLAG_RD, &adapter->mbuf_defrag_failed, "m_defrag() failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_events", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "link_irq", CTLFLAG_RD, &adapter->link_irq, "Link MSIX IRQ Handled"); for (int i = 0; i < adapter->num_queues; i++, txr++) { snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate", CTLTYPE_UINT | CTLFLAG_RW, &adapter->queues[i], sizeof(&adapter->queues[i]), ixgbe_sysctl_interrupt_rate_handler, "IU", "Interrupt Rate"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "irqs", CTLFLAG_RD, &(adapter->queues[i].irqs), "irqs on this queue"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", CTLTYPE_UINT | CTLFLAG_RD, txr, sizeof(txr), ixgbe_sysctl_tdh_handler, "IU", "Transmit Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_tail", CTLTYPE_UINT | CTLFLAG_RD, txr, sizeof(txr), ixgbe_sysctl_tdt_handler, "IU", "Transmit Descriptor Tail"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "tso_tx", CTLFLAG_RD, &txr->tso_tx, "TSO"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "no_tx_dma_setup", CTLFLAG_RD, &txr->no_tx_dma_setup, "Driver tx dma failure in xmit"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue No Descriptor Available"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &txr->total_packets, "Queue Packets Transmitted"); } for (int i = 0; i < adapter->num_queues; i++, rxr++) { snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); struct lro_ctrl *lro = &rxr->lro; snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, rxr, sizeof(rxr), ixgbe_sysctl_rdh_handler, "IU", "Receive Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_tail", CTLTYPE_UINT | CTLFLAG_RD, rxr, sizeof(rxr), ixgbe_sysctl_rdt_handler, "IU", "Receive Descriptor Tail"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &rxr->rx_packets, "Queue Packets Received"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &rxr->rx_bytes, "Queue Bytes Received"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_copies", CTLFLAG_RD, &rxr->rx_copies, "Copied RX Frames"); SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_queued", CTLFLAG_RD, &lro->lro_queued, 0, "LRO Queued"); SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_flushed", CTLFLAG_RD, &lro->lro_flushed, 0, "LRO Flushed"); } /* MAC stats get the own sub node */ stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC Statistics"); stat_list = SYSCTL_CHILDREN(stat_node); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "crc_errs", CTLFLAG_RD, &stats->crcerrs, "CRC Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "ill_errs", CTLFLAG_RD, &stats->illerrc, "Illegal Byte Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "byte_errs", CTLFLAG_RD, &stats->errbc, "Byte Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "short_discards", CTLFLAG_RD, &stats->mspdc, "MAC Short Packets Discarded"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "local_faults", CTLFLAG_RD, &stats->mlfc, "MAC Local Faults"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "remote_faults", CTLFLAG_RD, &stats->mrfc, "MAC Remote Faults"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rec_len_errs", CTLFLAG_RD, &stats->rlec, "Receive Length Errors"); /* Flow Control stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_txd", CTLFLAG_RD, &stats->lxontxc, "Link XON Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_recvd", CTLFLAG_RD, &stats->lxonrxc, "Link XON Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_txd", CTLFLAG_RD, &stats->lxofftxc, "Link XOFF Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_recvd", CTLFLAG_RD, &stats->lxoffrxc, "Link XOFF Received"); /* Packet Reception Stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_octets_rcvd", CTLFLAG_RD, &stats->tor, "Total Octets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_rcvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_rcvd", CTLFLAG_RD, &stats->tpr, "Total Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_rcvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_rcvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_rcvd", CTLFLAG_RD, &stats->bprc, "Broadcast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_64", CTLFLAG_RD, &stats->prc64, "64 byte frames received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_65_127", CTLFLAG_RD, &stats->prc127, "65-127 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_128_255", CTLFLAG_RD, &stats->prc255, "128-255 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_256_511", CTLFLAG_RD, &stats->prc511, "256-511 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_512_1023", CTLFLAG_RD, &stats->prc1023, "512-1023 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_1024_1522", CTLFLAG_RD, &stats->prc1522, "1023-1522 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_undersized", CTLFLAG_RD, &stats->ruc, "Receive Undersized"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_fragmented", CTLFLAG_RD, &stats->rfc, "Fragmented Packets Received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_oversized", CTLFLAG_RD, &stats->roc, "Oversized Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_jabberd", CTLFLAG_RD, &stats->rjc, "Received Jabber"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_rcvd", CTLFLAG_RD, &stats->mngprc, "Management Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_drpd", CTLFLAG_RD, &stats->mngptc, "Management Packets Dropped"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "checksum_errs", CTLFLAG_RD, &stats->xec, "Checksum Errors"); /* Packet Transmission Stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_txd", CTLFLAG_RD, &stats->tpt, "Total Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_txd", CTLFLAG_RD, &stats->bptc, "Broadcast Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_txd", CTLFLAG_RD, &stats->mptc, "Multicast Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_txd", CTLFLAG_RD, &stats->mngptc, "Management Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_64", CTLFLAG_RD, &stats->ptc64, "64 byte frames transmitted "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_65_127", CTLFLAG_RD, &stats->ptc127, "65-127 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_128_255", CTLFLAG_RD, &stats->ptc255, "128-255 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_256_511", CTLFLAG_RD, &stats->ptc511, "256-511 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_512_1023", CTLFLAG_RD, &stats->ptc1023, "512-1023 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_1024_1522", CTLFLAG_RD, &stats->ptc1522, "1024-1522 byte frames transmitted"); } /* ** Set flow control using sysctl: ** Flow control values: ** 0 - off ** 1 - rx pause ** 2 - tx pause ** 3 - full */ static int ixgbe_set_flowcntl(SYSCTL_HANDLER_ARGS) { int error, last; struct adapter *adapter = (struct adapter *) arg1; last = adapter->fc; error = sysctl_handle_int(oidp, &adapter->fc, 0, req); if ((error) || (req->newptr == NULL)) return (error); /* Don't bother if it's not changed */ if (adapter->fc == last) return (0); switch (adapter->fc) { case ixgbe_fc_rx_pause: case ixgbe_fc_tx_pause: case ixgbe_fc_full: adapter->hw.fc.requested_mode = adapter->fc; if (adapter->num_queues > 1) ixgbe_disable_rx_drop(adapter); break; case ixgbe_fc_none: adapter->hw.fc.requested_mode = ixgbe_fc_none; if (adapter->num_queues > 1) ixgbe_enable_rx_drop(adapter); break; default: adapter->fc = last; return (EINVAL); } /* Don't autoneg if forcing a value */ adapter->hw.fc.disable_fc_autoneg = TRUE; ixgbe_fc_enable(&adapter->hw); return error; } static int ixgbe_per_unit_num_queues(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; adapter = (struct adapter *) arg1; return sysctl_handle_int(oidp, &adapter->num_queues, 0, req); } /* ** Control link advertise speed: ** 1 - advertise only 1G ** 2 - advertise 100Mb ** 3 - advertise normal */ static int ixgbe_set_advertise(SYSCTL_HANDLER_ARGS) { int error = 0; struct adapter *adapter; device_t dev; struct ixgbe_hw *hw; ixgbe_link_speed speed, last; adapter = (struct adapter *) arg1; dev = adapter->dev; hw = &adapter->hw; last = adapter->advertise; error = sysctl_handle_int(oidp, &adapter->advertise, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (adapter->advertise == last) /* no change */ return (0); if (!((hw->phy.media_type == ixgbe_media_type_copper) || (hw->phy.multispeed_fiber))) return (EINVAL); if ((adapter->advertise == 2) && (hw->mac.type != ixgbe_mac_X540)) { device_printf(dev, "Set Advertise: 100Mb on X540 only\n"); return (EINVAL); } if (adapter->advertise == 1) speed = IXGBE_LINK_SPEED_1GB_FULL; else if (adapter->advertise == 2) speed = IXGBE_LINK_SPEED_100_FULL; else if (adapter->advertise == 3) speed = IXGBE_LINK_SPEED_1GB_FULL | IXGBE_LINK_SPEED_10GB_FULL; else { /* bogus value */ adapter->advertise = last; return (EINVAL); } hw->mac.autotry_restart = TRUE; hw->mac.ops.setup_link(hw, speed, TRUE); return (error); } /* ** Thermal Shutdown Trigger ** - cause a Thermal Overtemp IRQ ** - this now requires firmware enabling */ static int ixgbe_set_thermal_test(SYSCTL_HANDLER_ARGS) { int error, fire = 0; struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; if (hw->mac.type != ixgbe_mac_X540) return (0); error = sysctl_handle_int(oidp, &fire, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (fire) { u32 reg = IXGBE_READ_REG(hw, IXGBE_EICS); reg |= IXGBE_EICR_TS; IXGBE_WRITE_REG(hw, IXGBE_EICS, reg); } return (0); } /* ** Enable the hardware to drop packets when the buffer is ** full. This is useful when multiqueue,so that no single ** queue being full stalls the entire RX engine. We only ** enable this when Multiqueue AND when Flow Control is ** disabled. */ static void ixgbe_enable_rx_drop(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; for (int i = 0; i < adapter->num_queues; i++) { u32 srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(i)); srrctl |= IXGBE_SRRCTL_DROP_EN; IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(i), srrctl); } } static void ixgbe_disable_rx_drop(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; for (int i = 0; i < adapter->num_queues; i++) { u32 srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(i)); srrctl &= ~IXGBE_SRRCTL_DROP_EN; IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(i), srrctl); } } Index: head/sys/dev/ixgbe/ixv.c =================================================================== --- head/sys/dev/ixgbe/ixv.c (revision 275357) +++ head/sys/dev/ixgbe/ixv.c (revision 275358) @@ -1,4006 +1,4006 @@ /****************************************************************************** Copyright (c) 2001-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #include "opt_inet.h" #include "opt_inet6.h" #include "ixv.h" /********************************************************************* * Driver version *********************************************************************/ char ixv_driver_version[] = "1.1.4"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into ixv_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static ixv_vendor_info_t ixv_vendor_info_array[] = { {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_VF, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X540_VF, 0, 0, 0}, /* required last entry */ {0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings *********************************************************************/ static char *ixv_strings[] = { "Intel(R) PRO/10GbE Virtual Function Network Driver" }; /********************************************************************* * Function prototypes *********************************************************************/ static int ixv_probe(device_t); static int ixv_attach(device_t); static int ixv_detach(device_t); static int ixv_shutdown(device_t); #if __FreeBSD_version < 800000 static void ixv_start(struct ifnet *); static void ixv_start_locked(struct tx_ring *, struct ifnet *); #else static int ixv_mq_start(struct ifnet *, struct mbuf *); static int ixv_mq_start_locked(struct ifnet *, struct tx_ring *, struct mbuf *); static void ixv_qflush(struct ifnet *); #endif static int ixv_ioctl(struct ifnet *, u_long, caddr_t); static void ixv_init(void *); static void ixv_init_locked(struct adapter *); static void ixv_stop(void *); static void ixv_media_status(struct ifnet *, struct ifmediareq *); static int ixv_media_change(struct ifnet *); static void ixv_identify_hardware(struct adapter *); static int ixv_allocate_pci_resources(struct adapter *); static int ixv_allocate_msix(struct adapter *); static int ixv_allocate_queues(struct adapter *); static int ixv_setup_msix(struct adapter *); static void ixv_free_pci_resources(struct adapter *); static void ixv_local_timer(void *); static void ixv_setup_interface(device_t, struct adapter *); static void ixv_config_link(struct adapter *); static int ixv_allocate_transmit_buffers(struct tx_ring *); static int ixv_setup_transmit_structures(struct adapter *); static void ixv_setup_transmit_ring(struct tx_ring *); static void ixv_initialize_transmit_units(struct adapter *); static void ixv_free_transmit_structures(struct adapter *); static void ixv_free_transmit_buffers(struct tx_ring *); static int ixv_allocate_receive_buffers(struct rx_ring *); static int ixv_setup_receive_structures(struct adapter *); static int ixv_setup_receive_ring(struct rx_ring *); static void ixv_initialize_receive_units(struct adapter *); static void ixv_free_receive_structures(struct adapter *); static void ixv_free_receive_buffers(struct rx_ring *); static void ixv_enable_intr(struct adapter *); static void ixv_disable_intr(struct adapter *); static bool ixv_txeof(struct tx_ring *); static bool ixv_rxeof(struct ix_queue *, int); static void ixv_rx_checksum(u32, struct mbuf *, u32); static void ixv_set_multi(struct adapter *); static void ixv_update_link_status(struct adapter *); static void ixv_refresh_mbufs(struct rx_ring *, int); static int ixv_xmit(struct tx_ring *, struct mbuf **); static int ixv_sysctl_stats(SYSCTL_HANDLER_ARGS); static int ixv_sysctl_debug(SYSCTL_HANDLER_ARGS); static int ixv_set_flowcntl(SYSCTL_HANDLER_ARGS); static int ixv_dma_malloc(struct adapter *, bus_size_t, struct ixv_dma_alloc *, int); static void ixv_dma_free(struct adapter *, struct ixv_dma_alloc *); static void ixv_add_rx_process_limit(struct adapter *, const char *, const char *, int *, int); static bool ixv_tx_ctx_setup(struct tx_ring *, struct mbuf *); static bool ixv_tso_setup(struct tx_ring *, struct mbuf *, u32 *); static void ixv_set_ivar(struct adapter *, u8, u8, s8); static void ixv_configure_ivars(struct adapter *); static u8 * ixv_mc_array_itr(struct ixgbe_hw *, u8 **, u32 *); static void ixv_setup_vlan_support(struct adapter *); static void ixv_register_vlan(void *, struct ifnet *, u16); static void ixv_unregister_vlan(void *, struct ifnet *, u16); static void ixv_save_stats(struct adapter *); static void ixv_init_stats(struct adapter *); static void ixv_update_stats(struct adapter *); static __inline void ixv_rx_discard(struct rx_ring *, int); static __inline void ixv_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u32); /* The MSI/X Interrupt handlers */ static void ixv_msix_que(void *); static void ixv_msix_mbx(void *); /* Deferred interrupt tasklets */ static void ixv_handle_que(void *, int); static void ixv_handle_mbx(void *, int); /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t ixv_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ixv_probe), DEVMETHOD(device_attach, ixv_attach), DEVMETHOD(device_detach, ixv_detach), DEVMETHOD(device_shutdown, ixv_shutdown), DEVMETHOD_END }; static driver_t ixv_driver = { "ix", ixv_methods, sizeof(struct adapter), }; extern devclass_t ixgbe_devclass; DRIVER_MODULE(ixv, pci, ixv_driver, ixgbe_devclass, 0, 0); MODULE_DEPEND(ixv, pci, 1, 1, 1); MODULE_DEPEND(ixv, ether, 1, 1, 1); /* ** TUNEABLE PARAMETERS: */ /* ** AIM: Adaptive Interrupt Moderation ** which means that the interrupt rate ** is varied over time based on the ** traffic for that interrupt vector */ static int ixv_enable_aim = FALSE; TUNABLE_INT("hw.ixv.enable_aim", &ixv_enable_aim); /* How many packets rxeof tries to clean at a time */ static int ixv_rx_process_limit = 128; TUNABLE_INT("hw.ixv.rx_process_limit", &ixv_rx_process_limit); /* Flow control setting, default to full */ static int ixv_flow_control = ixgbe_fc_full; TUNABLE_INT("hw.ixv.flow_control", &ixv_flow_control); /* * Header split: this causes the hardware to DMA * the header into a seperate mbuf from the payload, * it can be a performance win in some workloads, but * in others it actually hurts, its off by default. */ static int ixv_header_split = FALSE; TUNABLE_INT("hw.ixv.hdr_split", &ixv_header_split); /* ** Number of TX descriptors per ring, ** setting higher than RX as this seems ** the better performing choice. */ static int ixv_txd = DEFAULT_TXD; TUNABLE_INT("hw.ixv.txd", &ixv_txd); /* Number of RX descriptors per ring */ static int ixv_rxd = DEFAULT_RXD; TUNABLE_INT("hw.ixv.rxd", &ixv_rxd); /* ** Shadow VFTA table, this is needed because ** the real filter table gets cleared during ** a soft reset and we need to repopulate it. */ static u32 ixv_shadow_vfta[VFTA_SIZE]; /********************************************************************* * Device identification routine * * ixv_probe determines if the driver should be loaded on * adapter based on PCI vendor/device id of the adapter. * * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int ixv_probe(device_t dev) { ixv_vendor_info_t *ent; u16 pci_vendor_id = 0; u16 pci_device_id = 0; u16 pci_subvendor_id = 0; u16 pci_subdevice_id = 0; char adapter_name[256]; pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != IXGBE_INTEL_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = ixv_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == 0)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == 0))) { sprintf(adapter_name, "%s, Version - %s", ixv_strings[ent->index], ixv_driver_version); device_set_desc_copy(dev, adapter_name); return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. * * return 0 on success, positive on failure *********************************************************************/ static int ixv_attach(device_t dev) { struct adapter *adapter; struct ixgbe_hw *hw; int error = 0; INIT_DEBUGOUT("ixv_attach: begin"); /* Allocate, clear, and link in our adapter structure */ adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; hw = &adapter->hw; /* Core Lock Init*/ IXV_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTL APIs */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "stats", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixv_sysctl_stats, "I", "Statistics"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "debug", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixv_sysctl_debug, "I", "Debug Info"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "flow_control", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixv_set_flowcntl, "I", "Flow Control"); SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "enable_aim", CTLFLAG_RW, &ixv_enable_aim, 1, "Interrupt Moderation"); /* Set up the timer callout */ callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); /* Determine hardware revision */ ixv_identify_hardware(adapter); /* Do base PCI setup - map BAR0 */ if (ixv_allocate_pci_resources(adapter)) { device_printf(dev, "Allocation of PCI resources failed\n"); error = ENXIO; goto err_out; } /* Do descriptor calc and sanity checks */ if (((ixv_txd * sizeof(union ixgbe_adv_tx_desc)) % DBA_ALIGN) != 0 || ixv_txd < MIN_TXD || ixv_txd > MAX_TXD) { device_printf(dev, "TXD config issue, using default!\n"); adapter->num_tx_desc = DEFAULT_TXD; } else adapter->num_tx_desc = ixv_txd; if (((ixv_rxd * sizeof(union ixgbe_adv_rx_desc)) % DBA_ALIGN) != 0 || ixv_rxd < MIN_RXD || ixv_rxd > MAX_RXD) { device_printf(dev, "RXD config issue, using default!\n"); adapter->num_rx_desc = DEFAULT_RXD; } else adapter->num_rx_desc = ixv_rxd; /* Allocate our TX/RX Queues */ if (ixv_allocate_queues(adapter)) { error = ENOMEM; goto err_out; } /* ** Initialize the shared code: its ** at this point the mac type is set. */ error = ixgbe_init_shared_code(hw); if (error) { device_printf(dev,"Shared Code Initialization Failure\n"); error = EIO; goto err_late; } /* Setup the mailbox */ ixgbe_init_mbx_params_vf(hw); ixgbe_reset_hw(hw); /* Get Hardware Flow Control setting */ hw->fc.requested_mode = ixgbe_fc_full; hw->fc.pause_time = IXV_FC_PAUSE; hw->fc.low_water[0] = IXV_FC_LO; hw->fc.high_water[0] = IXV_FC_HI; hw->fc.send_xon = TRUE; error = ixgbe_init_hw(hw); if (error) { device_printf(dev,"Hardware Initialization Failure\n"); error = EIO; goto err_late; } error = ixv_allocate_msix(adapter); if (error) goto err_late; /* Setup OS specific network interface */ ixv_setup_interface(dev, adapter); /* Sysctl for limiting the amount of work done in the taskqueue */ ixv_add_rx_process_limit(adapter, "rx_processing_limit", "max number of rx packets to process", &adapter->rx_process_limit, ixv_rx_process_limit); /* Do the stats setup */ ixv_save_stats(adapter); ixv_init_stats(adapter); /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ixv_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ixv_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); INIT_DEBUGOUT("ixv_attach: end"); return (0); err_late: ixv_free_transmit_structures(adapter); ixv_free_receive_structures(adapter); err_out: ixv_free_pci_resources(adapter); return (error); } /********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. * * return 0 on success, positive on failure *********************************************************************/ static int ixv_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ix_queue *que = adapter->queues; INIT_DEBUGOUT("ixv_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } IXV_CORE_LOCK(adapter); ixv_stop(adapter); IXV_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++, que++) { if (que->tq) { taskqueue_drain(que->tq, &que->que_task); taskqueue_free(que->tq); } } /* Drain the Link queue */ if (adapter->tq) { taskqueue_drain(adapter->tq, &adapter->mbx_task); taskqueue_free(adapter->tq); } /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); ether_ifdetach(adapter->ifp); callout_drain(&adapter->timer); ixv_free_pci_resources(adapter); bus_generic_detach(dev); if_free(adapter->ifp); ixv_free_transmit_structures(adapter); ixv_free_receive_structures(adapter); IXV_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int ixv_shutdown(device_t dev) { struct adapter *adapter = device_get_softc(dev); IXV_CORE_LOCK(adapter); ixv_stop(adapter); IXV_CORE_UNLOCK(adapter); return (0); } #if __FreeBSD_version < 800000 /********************************************************************* * Transmit entry point * * ixv_start is called by the stack to initiate a transmit. * The driver will remain in this routine as long as there are * packets to transmit and transmit resources are available. * In case resources are not available stack is notified and * the packet is requeued. **********************************************************************/ static void ixv_start_locked(struct tx_ring *txr, struct ifnet * ifp) { struct mbuf *m_head; struct adapter *adapter = txr->adapter; IXV_TX_LOCK_ASSERT(txr); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; if (!adapter->link_active) return; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (ixv_xmit(txr, &m_head)) { if (m_head == NULL) break; ifp->if_drv_flags |= IFF_DRV_OACTIVE; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); /* Set watchdog on */ txr->watchdog_check = TRUE; txr->watchdog_time = ticks; } return; } /* * Legacy TX start - called by the stack, this * always uses the first tx ring, and should * not be used with multiqueue tx enabled. */ static void ixv_start(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXV_TX_LOCK(txr); ixv_start_locked(txr, ifp); IXV_TX_UNLOCK(txr); } return; } #else /* ** Multiqueue Transmit driver ** */ static int ixv_mq_start(struct ifnet *ifp, struct mbuf *m) { struct adapter *adapter = ifp->if_softc; struct ix_queue *que; struct tx_ring *txr; int i = 0, err = 0; /* Which queue to use */ - if ((m->m_flags & M_FLOWID) != 0) + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % adapter->num_queues; txr = &adapter->tx_rings[i]; que = &adapter->queues[i]; if (IXV_TX_TRYLOCK(txr)) { err = ixv_mq_start_locked(ifp, txr, m); IXV_TX_UNLOCK(txr); } else { err = drbr_enqueue(ifp, txr->br, m); taskqueue_enqueue(que->tq, &que->que_task); } return (err); } static int ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m) { struct adapter *adapter = txr->adapter; struct mbuf *next; int enqueued, err = 0; if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || adapter->link_active == 0) { if (m != NULL) err = drbr_enqueue(ifp, txr->br, m); return (err); } /* Do a clean if descriptors are low */ if (txr->tx_avail <= IXV_TX_CLEANUP_THRESHOLD) ixv_txeof(txr); enqueued = 0; if (m != NULL) { err = drbr_enqueue(ifp, txr->br, m); if (err) { return (err); } } /* Process the queue */ while ((next = drbr_peek(ifp, txr->br)) != NULL) { if ((err = ixv_xmit(txr, &next)) != 0) { if (next == NULL) { drbr_advance(ifp, txr->br); } else { drbr_putback(ifp, txr->br, next); } break; } drbr_advance(ifp, txr->br); enqueued++; if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); if (next->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; if (txr->tx_avail <= IXV_TX_OP_THRESHOLD) { ifp->if_drv_flags |= IFF_DRV_OACTIVE; break; } } if (enqueued > 0) { /* Set watchdog on */ txr->watchdog_check = TRUE; txr->watchdog_time = ticks; } return (err); } /* ** Flush all ring buffers */ static void ixv_qflush(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; struct mbuf *m; for (int i = 0; i < adapter->num_queues; i++, txr++) { IXV_TX_LOCK(txr); while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) m_freem(m); IXV_TX_UNLOCK(txr); } if_qflush(ifp); } #endif /********************************************************************* * Ioctl entry point * * ixv_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int ixv_ioctl(struct ifnet * ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *) data; bool avoid_reset = FALSE; #endif int error = 0; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif #if defined(INET) || defined(INET6) /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ixv_init(adapter); if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); } else error = ether_ioctl(ifp, command, data); break; #endif case SIOCSIFMTU: IOCTL_DEBUGOUT("ioctl: SIOCSIFMTU (Set Interface MTU)"); if (ifr->ifr_mtu > IXV_MAX_FRAME_SIZE - ETHER_HDR_LEN) { error = EINVAL; } else { IXV_CORE_LOCK(adapter); ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; ixv_init_locked(adapter); IXV_CORE_UNLOCK(adapter); } break; case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl: SIOCSIFFLAGS (Set Interface Flags)"); IXV_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ixv_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ixv_stop(adapter); adapter->if_flags = ifp->if_flags; IXV_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXV_CORE_LOCK(adapter); ixv_disable_intr(adapter); ixv_set_multi(adapter); ixv_enable_intr(adapter); IXV_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl: SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { int mask = ifr->ifr_reqcap ^ ifp->if_capenable; IOCTL_DEBUGOUT("ioctl: SIOCSIFCAP (Set Capabilities)"); if (mask & IFCAP_HWCSUM) ifp->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXV_CORE_LOCK(adapter); ixv_init_locked(adapter); IXV_CORE_UNLOCK(adapter); } VLAN_CAPABILITIES(ifp); break; } default: IOCTL_DEBUGOUT1("ioctl: UNKNOWN (0x%X)\n", (int)command); error = ether_ioctl(ifp, command, data); break; } return (error); } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ #define IXGBE_MHADD_MFS_SHIFT 16 static void ixv_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; u32 mhadd, gpie; INIT_DEBUGOUT("ixv_init: begin"); mtx_assert(&adapter->core_mtx, MA_OWNED); hw->adapter_stopped = FALSE; ixgbe_stop_adapter(hw); callout_stop(&adapter->timer); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV); /* Get the latest mac address, User can use a LAA */ bcopy(IF_LLADDR(adapter->ifp), hw->mac.addr, IXGBE_ETH_LENGTH_OF_ADDRESS); ixgbe_set_rar(hw, 0, hw->mac.addr, 0, 1); hw->addr_ctrl.rar_used_count = 1; /* Prepare transmit descriptors and buffers */ if (ixv_setup_transmit_structures(adapter)) { device_printf(dev,"Could not setup transmit structures\n"); ixv_stop(adapter); return; } ixgbe_reset_hw(hw); ixv_initialize_transmit_units(adapter); /* Setup Multicast table */ ixv_set_multi(adapter); /* ** Determine the correct mbuf pool ** for doing jumbo/headersplit */ if (ifp->if_mtu > ETHERMTU) adapter->rx_mbuf_sz = MJUMPAGESIZE; else adapter->rx_mbuf_sz = MCLBYTES; /* Prepare receive descriptors and buffers */ if (ixv_setup_receive_structures(adapter)) { device_printf(dev,"Could not setup receive structures\n"); ixv_stop(adapter); return; } /* Configure RX settings */ ixv_initialize_receive_units(adapter); /* Enable Enhanced MSIX mode */ gpie = IXGBE_READ_REG(&adapter->hw, IXGBE_GPIE); gpie |= IXGBE_GPIE_MSIX_MODE | IXGBE_GPIE_EIAME; gpie |= IXGBE_GPIE_PBA_SUPPORT | IXGBE_GPIE_OCD; IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie); /* Set the various hardware offload abilities */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_TSO; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); #if __FreeBSD_version >= 800000 ifp->if_hwassist |= CSUM_SCTP; #endif } /* Set MTU size */ if (ifp->if_mtu > ETHERMTU) { mhadd = IXGBE_READ_REG(hw, IXGBE_MHADD); mhadd &= ~IXGBE_MHADD_MFS_MASK; mhadd |= adapter->max_frame_size << IXGBE_MHADD_MFS_SHIFT; IXGBE_WRITE_REG(hw, IXGBE_MHADD, mhadd); } /* Set up VLAN offload and filter */ ixv_setup_vlan_support(adapter); callout_reset(&adapter->timer, hz, ixv_local_timer, adapter); /* Set up MSI/X routing */ ixv_configure_ivars(adapter); /* Set up auto-mask */ IXGBE_WRITE_REG(hw, IXGBE_VTEIAM, IXGBE_EICS_RTX_QUEUE); /* Set moderation on the Link interrupt */ IXGBE_WRITE_REG(hw, IXGBE_VTEITR(adapter->mbxvec), IXV_LINK_ITR); /* Stats init */ ixv_init_stats(adapter); /* Config/Enable Link */ ixv_config_link(adapter); /* And now turn on interrupts */ ixv_enable_intr(adapter); /* Now inform the stack we're ready */ ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; return; } static void ixv_init(void *arg) { struct adapter *adapter = arg; IXV_CORE_LOCK(adapter); ixv_init_locked(adapter); IXV_CORE_UNLOCK(adapter); return; } /* ** ** MSIX Interrupt Handlers and Tasklets ** */ static inline void ixv_enable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u32 queue = 1 << vector; u32 mask; mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask); } static inline void ixv_disable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u64 queue = (u64)(1 << vector); u32 mask; mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_VTEIMC, mask); } static inline void ixv_rearm_queues(struct adapter *adapter, u64 queues) { u32 mask = (IXGBE_EIMS_RTX_QUEUE & queues); IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEICS, mask); } static void ixv_handle_que(void *context, int pending) { struct ix_queue *que = context; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; bool more; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { more = ixv_rxeof(que, adapter->rx_process_limit); IXV_TX_LOCK(txr); ixv_txeof(txr); #if __FreeBSD_version >= 800000 if (!drbr_empty(ifp, txr->br)) ixv_mq_start_locked(ifp, txr, NULL); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixv_start_locked(txr, ifp); #endif IXV_TX_UNLOCK(txr); if (more) { taskqueue_enqueue(que->tq, &que->que_task); return; } } /* Reenable this interrupt */ ixv_enable_queue(adapter, que->msix); return; } /********************************************************************* * * MSI Queue Interrupt Service routine * **********************************************************************/ void ixv_msix_que(void *arg) { struct ix_queue *que = arg; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; bool more_tx, more_rx; u32 newitr = 0; ixv_disable_queue(adapter, que->msix); ++que->irqs; more_rx = ixv_rxeof(que, adapter->rx_process_limit); IXV_TX_LOCK(txr); more_tx = ixv_txeof(txr); /* ** Make certain that if the stack ** has anything queued the task gets ** scheduled to handle it. */ #if __FreeBSD_version < 800000 if (!IFQ_DRV_IS_EMPTY(&adapter->ifp->if_snd)) #else if (!drbr_empty(adapter->ifp, txr->br)) #endif more_tx = 1; IXV_TX_UNLOCK(txr); more_rx = ixv_rxeof(que, adapter->rx_process_limit); /* Do AIM now? */ if (ixv_enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: ** - Write out last calculated setting ** - Calculate based on average size over ** the last interval. */ if (que->eitr_setting) IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEITR(que->msix), que->eitr_setting); que->eitr_setting = 0; /* Idle, do nothing */ if ((txr->bytes == 0) && (rxr->bytes == 0)) goto no_calc; if ((txr->bytes) && (txr->packets)) newitr = txr->bytes/txr->packets; if ((rxr->bytes) && (rxr->packets)) newitr = max(newitr, (rxr->bytes / rxr->packets)); newitr += 24; /* account for hardware frame, crc */ /* set an upper boundary */ newitr = min(newitr, 3000); /* Be nice to the mid range */ if ((newitr > 300) && (newitr < 1200)) newitr = (newitr / 3); else newitr = (newitr / 2); newitr |= newitr << 16; /* save for next interrupt */ que->eitr_setting = newitr; /* Reset state */ txr->bytes = 0; txr->packets = 0; rxr->bytes = 0; rxr->packets = 0; no_calc: if (more_tx || more_rx) taskqueue_enqueue(que->tq, &que->que_task); else /* Reenable this interrupt */ ixv_enable_queue(adapter, que->msix); return; } static void ixv_msix_mbx(void *arg) { struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; u32 reg; ++adapter->mbx_irq; /* First get the cause */ reg = IXGBE_READ_REG(hw, IXGBE_VTEICS); /* Clear interrupt with write */ IXGBE_WRITE_REG(hw, IXGBE_VTEICR, reg); /* Link status change */ if (reg & IXGBE_EICR_LSC) taskqueue_enqueue(adapter->tq, &adapter->mbx_task); IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, IXGBE_EIMS_OTHER); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. * **********************************************************************/ static void ixv_media_status(struct ifnet * ifp, struct ifmediareq * ifmr) { struct adapter *adapter = ifp->if_softc; INIT_DEBUGOUT("ixv_media_status: begin"); IXV_CORE_LOCK(adapter); ixv_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { IXV_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; switch (adapter->link_speed) { case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_T | IFM_FDX; break; case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_FDX; break; } IXV_CORE_UNLOCK(adapter); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. * **********************************************************************/ static int ixv_media_change(struct ifnet * ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; INIT_DEBUGOUT("ixv_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: break; default: device_printf(adapter->dev, "Only auto media type\n"); return (EINVAL); } return (0); } /********************************************************************* * * This routine maps the mbufs to tx descriptors, allowing the * TX engine to transmit the packets. * - return 0 on success, positive on failure * **********************************************************************/ static int ixv_xmit(struct tx_ring *txr, struct mbuf **m_headp) { struct adapter *adapter = txr->adapter; u32 olinfo_status = 0, cmd_type_len; u32 paylen = 0; int i, j, error, nsegs; int first, last = 0; struct mbuf *m_head; bus_dma_segment_t segs[32]; bus_dmamap_t map; struct ixv_tx_buf *txbuf, *txbuf_mapped; union ixgbe_adv_tx_desc *txd = NULL; m_head = *m_headp; /* Basic descriptor defines */ cmd_type_len = (IXGBE_ADVTXD_DTYP_DATA | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT); if (m_head->m_flags & M_VLANTAG) cmd_type_len |= IXGBE_ADVTXD_DCMD_VLE; /* * Important to capture the first descriptor * used because it will contain the index of * the one we tell the hardware to report back */ first = txr->next_avail_desc; txbuf = &txr->tx_buffers[first]; txbuf_mapped = txbuf; map = txbuf->map; /* * Map the packet for DMA. */ error = bus_dmamap_load_mbuf_sg(txr->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m; m = m_defrag(*m_headp, M_NOWAIT); if (m == NULL) { adapter->mbuf_defrag_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; /* Try it again */ error = bus_dmamap_load_mbuf_sg(txr->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (error == ENOMEM) { adapter->no_tx_dma_setup++; return (error); } else if (error != 0) { adapter->no_tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } } else if (error == ENOMEM) { adapter->no_tx_dma_setup++; return (error); } else if (error != 0) { adapter->no_tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } /* Make certain there are enough descriptors */ if (nsegs > txr->tx_avail - 2) { txr->no_desc_avail++; error = ENOBUFS; goto xmit_fail; } m_head = *m_headp; /* ** Set up the appropriate offload context ** this becomes the first descriptor of ** a packet. */ if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { if (ixv_tso_setup(txr, m_head, &paylen)) { cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE; olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8; olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; olinfo_status |= paylen << IXGBE_ADVTXD_PAYLEN_SHIFT; ++adapter->tso_tx; } else return (ENXIO); } else if (ixv_tx_ctx_setup(txr, m_head)) olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; /* Record payload length */ if (paylen == 0) olinfo_status |= m_head->m_pkthdr.len << IXGBE_ADVTXD_PAYLEN_SHIFT; i = txr->next_avail_desc; for (j = 0; j < nsegs; j++) { bus_size_t seglen; bus_addr_t segaddr; txbuf = &txr->tx_buffers[i]; txd = &txr->tx_base[i]; seglen = segs[j].ds_len; segaddr = htole64(segs[j].ds_addr); txd->read.buffer_addr = segaddr; txd->read.cmd_type_len = htole32(txr->txd_cmd | cmd_type_len |seglen); txd->read.olinfo_status = htole32(olinfo_status); last = i; /* descriptor that will get completion IRQ */ if (++i == adapter->num_tx_desc) i = 0; txbuf->m_head = NULL; txbuf->eop_index = -1; } txd->read.cmd_type_len |= htole32(IXGBE_TXD_CMD_EOP | IXGBE_TXD_CMD_RS); txr->tx_avail -= nsegs; txr->next_avail_desc = i; txbuf->m_head = m_head; txr->tx_buffers[first].map = txbuf->map; txbuf->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); /* Set the index of the descriptor that will be marked done */ txbuf = &txr->tx_buffers[first]; txbuf->eop_index = last; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * Advance the Transmit Descriptor Tail (Tdt), this tells the * hardware that this frame is available to transmit. */ ++txr->total_packets; IXGBE_WRITE_REG(&adapter->hw, IXGBE_VFTDT(txr->me), i); return (0); xmit_fail: bus_dmamap_unload(txr->txtag, txbuf->map); return (error); } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ #define IXGBE_RAR_ENTRIES 16 static void ixv_set_multi(struct adapter *adapter) { u8 mta[MAX_NUM_MULTICAST_ADDRESSES * IXGBE_ETH_LENGTH_OF_ADDRESS]; u8 *update_ptr; struct ifmultiaddr *ifma; int mcnt = 0; struct ifnet *ifp = adapter->ifp; IOCTL_DEBUGOUT("ixv_set_multi: begin"); #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), &mta[mcnt * IXGBE_ETH_LENGTH_OF_ADDRESS], IXGBE_ETH_LENGTH_OF_ADDRESS); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif update_ptr = mta; ixgbe_update_mc_addr_list(&adapter->hw, update_ptr, mcnt, ixv_mc_array_itr, TRUE); return; } /* * This is an iterator function now needed by the multicast * shared code. It simply feeds the shared code routine the * addresses in the array of ixv_set_multi() one by one. */ static u8 * ixv_mc_array_itr(struct ixgbe_hw *hw, u8 **update_ptr, u32 *vmdq) { u8 *addr = *update_ptr; u8 *newptr; *vmdq = 0; newptr = addr + IXGBE_ETH_LENGTH_OF_ADDRESS; *update_ptr = newptr; return addr; } /********************************************************************* * Timer routine * * This routine checks for link status,updates statistics, * and runs the watchdog check. * **********************************************************************/ static void ixv_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; int i; mtx_assert(&adapter->core_mtx, MA_OWNED); ixv_update_link_status(adapter); /* Stats Update */ ixv_update_stats(adapter); /* * If the interface has been paused * then don't do the watchdog check */ if (IXGBE_READ_REG(&adapter->hw, IXGBE_TFCS) & IXGBE_TFCS_TXOFF) goto out; /* ** Check for time since any descriptor was cleaned */ for (i = 0; i < adapter->num_queues; i++, txr++) { IXV_TX_LOCK(txr); if (txr->watchdog_check == FALSE) { IXV_TX_UNLOCK(txr); continue; } if ((ticks - txr->watchdog_time) > IXV_WATCHDOG) goto hung; IXV_TX_UNLOCK(txr); } out: ixv_rearm_queues(adapter, adapter->que_mask); callout_reset(&adapter->timer, hz, ixv_local_timer, adapter); return; hung: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); device_printf(dev,"Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, IXGBE_READ_REG(&adapter->hw, IXGBE_VFTDH(i)), IXGBE_READ_REG(&adapter->hw, IXGBE_VFTDT(i))); device_printf(dev,"TX(%d) desc avail = %d," "Next TX to Clean = %d\n", txr->me, txr->tx_avail, txr->next_to_clean); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; IXV_TX_UNLOCK(txr); ixv_init_locked(adapter); } /* ** Note: this routine updates the OS on the link state ** the real check of the hardware only happens with ** a link interrupt. */ static void ixv_update_link_status(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; device_t dev = adapter->dev; if (adapter->link_up){ if (adapter->link_active == FALSE) { if (bootverbose) device_printf(dev,"Link is up %d Gbps %s \n", ((adapter->link_speed == 128)? 10:1), "Full Duplex"); adapter->link_active = TRUE; if_link_state_change(ifp, LINK_STATE_UP); } } else { /* Link down */ if (adapter->link_active == TRUE) { if (bootverbose) device_printf(dev,"Link is Down\n"); if_link_state_change(ifp, LINK_STATE_DOWN); adapter->link_active = FALSE; for (int i = 0; i < adapter->num_queues; i++, txr++) txr->watchdog_check = FALSE; } } return; } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * **********************************************************************/ static void ixv_stop(void *arg) { struct ifnet *ifp; struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; ifp = adapter->ifp; mtx_assert(&adapter->core_mtx, MA_OWNED); INIT_DEBUGOUT("ixv_stop: begin\n"); ixv_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ixgbe_reset_hw(hw); adapter->hw.adapter_stopped = FALSE; ixgbe_stop_adapter(hw); callout_stop(&adapter->timer); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV); return; } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void ixv_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; u16 pci_cmd_word; /* ** Make sure BUSMASTER is set, on a VM under ** KVM it may not be and will break things. */ pci_enable_busmaster(dev); pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2); /* Save off the information about this board */ adapter->hw.vendor_id = pci_get_vendor(dev); adapter->hw.device_id = pci_get_device(dev); adapter->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1); adapter->hw.subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); adapter->hw.subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); return; } /********************************************************************* * * Setup MSIX Interrupt resources and handlers * **********************************************************************/ static int ixv_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; int error, rid, vector = 0; for (int i = 0; i < adapter->num_queues; i++, vector++, que++) { rid = vector + 1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (que->res == NULL) { device_printf(dev,"Unable to allocate" " bus resource: que interrupt [%d]\n", vector); return (ENXIO); } /* Set the handler function */ error = bus_setup_intr(dev, que->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixv_msix_que, que, &que->tag); if (error) { que->res = NULL; device_printf(dev, "Failed to register QUE handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, que->res, que->tag, "que %d", i); #endif que->msix = vector; adapter->que_mask |= (u64)(1 << que->msix); /* ** Bind the msix vector, and thus the ** ring to the corresponding cpu. */ if (adapter->num_queues > 1) bus_bind_intr(dev, que->res, i); TASK_INIT(&que->que_task, 0, ixv_handle_que, que); que->tq = taskqueue_create_fast("ixv_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); } /* and Mailbox */ rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (!adapter->res) { device_printf(dev,"Unable to allocate" " bus resource: MBX interrupt [%d]\n", rid); return (ENXIO); } /* Set the mbx handler function */ error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixv_msix_mbx, adapter, &adapter->tag); if (error) { adapter->res = NULL; device_printf(dev, "Failed to register LINK handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "mbx"); #endif adapter->mbxvec = vector; /* Tasklets for Mailbox */ TASK_INIT(&adapter->mbx_task, 0, ixv_handle_mbx, adapter); adapter->tq = taskqueue_create_fast("ixv_mbx", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s mbxq", device_get_nameunit(adapter->dev)); /* ** Due to a broken design QEMU will fail to properly ** enable the guest for MSIX unless the vectors in ** the table are all set up, so we must rewrite the ** ENABLE in the MSIX control register again at this ** point to cause it to successfully initialize us. */ if (adapter->hw.mac.type == ixgbe_mac_82599_vf) { int msix_ctrl; pci_find_cap(dev, PCIY_MSIX, &rid); rid += PCIR_MSIX_CTRL; msix_ctrl = pci_read_config(dev, rid, 2); msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE; pci_write_config(dev, rid, msix_ctrl, 2); } return (0); } /* * Setup MSIX resources, note that the VF * device MUST use MSIX, there is no fallback. */ static int ixv_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; int rid, want; /* First try MSI/X */ rid = PCIR_BAR(3); adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->msix_mem == NULL) { device_printf(adapter->dev, "Unable to map MSIX table \n"); goto out; } /* ** Want two vectors: one for a queue, ** plus an additional for mailbox. */ want = 2; if ((pci_alloc_msix(dev, &want) == 0) && (want == 2)) { device_printf(adapter->dev, "Using MSIX interrupts with %d vectors\n", want); return (want); } /* Release in case alloc was insufficient */ pci_release_msi(dev); out: if (adapter->msix_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, rid, adapter->msix_mem); adapter->msix_mem = NULL; } device_printf(adapter->dev,"MSIX config error\n"); return (ENXIO); } static int ixv_allocate_pci_resources(struct adapter *adapter) { int rid; device_t dev = adapter->dev; rid = PCIR_BAR(0); adapter->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!(adapter->pci_mem)) { device_printf(dev,"Unable to allocate bus resource: memory\n"); return (ENXIO); } adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->pci_mem); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->pci_mem); adapter->hw.hw_addr = (u8 *) &adapter->osdep.mem_bus_space_handle; adapter->num_queues = 1; adapter->hw.back = &adapter->osdep; /* ** Now setup MSI/X, should ** return us the number of ** configured vectors. */ adapter->msix = ixv_setup_msix(adapter); if (adapter->msix == ENXIO) return (ENXIO); else return (0); } static void ixv_free_pci_resources(struct adapter * adapter) { struct ix_queue *que = adapter->queues; device_t dev = adapter->dev; int rid, memrid; memrid = PCIR_BAR(MSIX_BAR); /* ** There is a slight possibility of a failure mode ** in attach that will result in entering this function ** before interrupt resources have been initialized, and ** in that case we do not want to execute the loops below ** We can detect this reliably by the state of the adapter ** res pointer. */ if (adapter->res == NULL) goto mem; /* ** Release all msix queue resources: */ for (int i = 0; i < adapter->num_queues; i++, que++) { rid = que->msix + 1; if (que->tag != NULL) { bus_teardown_intr(dev, que->res, que->tag); que->tag = NULL; } if (que->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, que->res); } /* Clean the Legacy or Link interrupt last */ if (adapter->mbxvec) /* we are doing MSIX */ rid = adapter->mbxvec + 1; else (adapter->msix != 0) ? (rid = 1):(rid = 0); if (adapter->tag != NULL) { bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); mem: if (adapter->msix) pci_release_msi(dev); if (adapter->msix_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, memrid, adapter->msix_mem); if (adapter->pci_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), adapter->pci_mem); return; } /********************************************************************* * * Setup networking device structure and register an interface. * **********************************************************************/ static void ixv_setup_interface(device_t dev, struct adapter *adapter) { struct ifnet *ifp; INIT_DEBUGOUT("ixv_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) panic("%s: can not if_alloc()\n", device_get_nameunit(dev)); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = 1000000000; ifp->if_init = ixv_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ixv_ioctl; #if __FreeBSD_version >= 800000 ifp->if_transmit = ixv_mq_start; ifp->if_qflush = ixv_qflush; #else ifp->if_start = ixv_start; #endif ifp->if_snd.ifq_maxlen = adapter->num_tx_desc - 2; ether_ifattach(ifp, adapter->hw.mac.addr); adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; /* * Tell the upper layer(s) we support long frames. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_TSO4 | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; ifp->if_capenable = ifp->if_capabilities; /* Don't enable LRO by default */ ifp->if_capabilities |= IFCAP_LRO; /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, ixv_media_change, ixv_media_status); ifmedia_add(&adapter->media, IFM_ETHER | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return; } static void ixv_config_link(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 autoneg, err = 0; if (hw->mac.ops.check_link) err = hw->mac.ops.check_link(hw, &autoneg, &adapter->link_up, FALSE); if (err) goto out; if (hw->mac.ops.setup_link) err = hw->mac.ops.setup_link(hw, autoneg, adapter->link_up); out: return; } /******************************************************************** * Manage DMA'able memory. *******************************************************************/ static void ixv_dmamap_cb(void *arg, bus_dma_segment_t * segs, int nseg, int error) { if (error) return; *(bus_addr_t *) arg = segs->ds_addr; return; } static int ixv_dma_malloc(struct adapter *adapter, bus_size_t size, struct ixv_dma_alloc *dma, int mapflags) { device_t dev = adapter->dev; int r; r = bus_dma_tag_create(bus_get_dma_tag(adapter->dev), /* parent */ DBA_ALIGN, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &dma->dma_tag); if (r != 0) { device_printf(dev,"ixv_dma_malloc: bus_dma_tag_create failed; " "error %u\n", r); goto fail_0; } r = bus_dmamem_alloc(dma->dma_tag, (void **)&dma->dma_vaddr, BUS_DMA_NOWAIT, &dma->dma_map); if (r != 0) { device_printf(dev,"ixv_dma_malloc: bus_dmamem_alloc failed; " "error %u\n", r); goto fail_1; } r = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, ixv_dmamap_cb, &dma->dma_paddr, mapflags | BUS_DMA_NOWAIT); if (r != 0) { device_printf(dev,"ixv_dma_malloc: bus_dmamap_load failed; " "error %u\n", r); goto fail_2; } dma->dma_size = size; return (0); fail_2: bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); fail_1: bus_dma_tag_destroy(dma->dma_tag); fail_0: dma->dma_tag = NULL; return (r); } static void ixv_dma_free(struct adapter *adapter, struct ixv_dma_alloc *dma) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); bus_dma_tag_destroy(dma->dma_tag); } /********************************************************************* * * Allocate memory for the transmit and receive rings, and then * the descriptors associated with each, called only once at attach. * **********************************************************************/ static int ixv_allocate_queues(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que; struct tx_ring *txr; struct rx_ring *rxr; int rsize, tsize, error = 0; int txconf = 0, rxconf = 0; /* First allocate the top level queue structs */ if (!(adapter->queues = (struct ix_queue *) malloc(sizeof(struct ix_queue) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate queue memory\n"); error = ENOMEM; goto fail; } /* First allocate the TX ring struct memory */ if (!(adapter->tx_rings = (struct tx_ring *) malloc(sizeof(struct tx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); error = ENOMEM; goto tx_fail; } /* Next allocate the RX */ if (!(adapter->rx_rings = (struct rx_ring *) malloc(sizeof(struct rx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); error = ENOMEM; goto rx_fail; } /* For the ring itself */ tsize = roundup2(adapter->num_tx_desc * sizeof(union ixgbe_adv_tx_desc), DBA_ALIGN); /* * Now set up the TX queues, txconf is needed to handle the * possibility that things fail midcourse and we need to * undo memory gracefully */ for (int i = 0; i < adapter->num_queues; i++, txconf++) { /* Set up some basics */ txr = &adapter->tx_rings[i]; txr->adapter = adapter; txr->me = i; /* Initialize the TX side lock */ snprintf(txr->mtx_name, sizeof(txr->mtx_name), "%s:tx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&txr->tx_mtx, txr->mtx_name, NULL, MTX_DEF); if (ixv_dma_malloc(adapter, tsize, &txr->txdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate TX Descriptor memory\n"); error = ENOMEM; goto err_tx_desc; } txr->tx_base = (union ixgbe_adv_tx_desc *)txr->txdma.dma_vaddr; bzero((void *)txr->tx_base, tsize); /* Now allocate transmit buffers for the ring */ if (ixv_allocate_transmit_buffers(txr)) { device_printf(dev, "Critical Failure setting up transmit buffers\n"); error = ENOMEM; goto err_tx_desc; } #if __FreeBSD_version >= 800000 /* Allocate a buf ring */ txr->br = buf_ring_alloc(IXV_BR_SIZE, M_DEVBUF, M_WAITOK, &txr->tx_mtx); if (txr->br == NULL) { device_printf(dev, "Critical Failure setting up buf ring\n"); error = ENOMEM; goto err_tx_desc; } #endif } /* * Next the RX queues... */ rsize = roundup2(adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN); for (int i = 0; i < adapter->num_queues; i++, rxconf++) { rxr = &adapter->rx_rings[i]; /* Set up some basics */ rxr->adapter = adapter; rxr->me = i; /* Initialize the RX side lock */ snprintf(rxr->mtx_name, sizeof(rxr->mtx_name), "%s:rx(%d)", device_get_nameunit(dev), rxr->me); mtx_init(&rxr->rx_mtx, rxr->mtx_name, NULL, MTX_DEF); if (ixv_dma_malloc(adapter, rsize, &rxr->rxdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate RxDescriptor memory\n"); error = ENOMEM; goto err_rx_desc; } rxr->rx_base = (union ixgbe_adv_rx_desc *)rxr->rxdma.dma_vaddr; bzero((void *)rxr->rx_base, rsize); /* Allocate receive buffers for the ring*/ if (ixv_allocate_receive_buffers(rxr)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); error = ENOMEM; goto err_rx_desc; } } /* ** Finally set up the queue holding structs */ for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; que->adapter = adapter; que->txr = &adapter->tx_rings[i]; que->rxr = &adapter->rx_rings[i]; } return (0); err_rx_desc: for (rxr = adapter->rx_rings; rxconf > 0; rxr++, rxconf--) ixv_dma_free(adapter, &rxr->rxdma); err_tx_desc: for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--) ixv_dma_free(adapter, &txr->txdma); free(adapter->rx_rings, M_DEVBUF); rx_fail: free(adapter->tx_rings, M_DEVBUF); tx_fail: free(adapter->queues, M_DEVBUF); fail: return (error); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int ixv_allocate_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; device_t dev = adapter->dev; struct ixv_tx_buf *txbuf; int error, i; /* * Setup DMA descriptor areas. */ if ((error = bus_dma_tag_create( bus_get_dma_tag(adapter->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IXV_TSO_SIZE, /* maxsize */ 32, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->txtag))) { device_printf(dev,"Unable to allocate TX DMA tag\n"); goto fail; } if (!(txr->tx_buffers = (struct ixv_tx_buf *) malloc(sizeof(struct ixv_tx_buf) * adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { error = bus_dmamap_create(txr->txtag, 0, &txbuf->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } return 0; fail: /* We free all, it handles case where we are in the middle */ ixv_free_transmit_structures(adapter); return (error); } /********************************************************************* * * Initialize a transmit ring. * **********************************************************************/ static void ixv_setup_transmit_ring(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct ixv_tx_buf *txbuf; int i; /* Clear the old ring contents */ IXV_TX_LOCK(txr); bzero((void *)txr->tx_base, (sizeof(union ixgbe_adv_tx_desc)) * adapter->num_tx_desc); /* Reset indices */ txr->next_avail_desc = 0; txr->next_to_clean = 0; /* Free any existing tx buffers. */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { if (txbuf->m_head != NULL) { bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, txbuf->map); m_freem(txbuf->m_head); txbuf->m_head = NULL; } /* Clear the EOP index */ txbuf->eop_index = -1; } /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); IXV_TX_UNLOCK(txr); } /********************************************************************* * * Initialize all transmit rings. * **********************************************************************/ static int ixv_setup_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) ixv_setup_transmit_ring(txr); return (0); } /********************************************************************* * * Enable transmit unit. * **********************************************************************/ static void ixv_initialize_transmit_units(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct ixgbe_hw *hw = &adapter->hw; for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 tdba = txr->txdma.dma_paddr; u32 txctrl, txdctl; /* Set WTHRESH to 8, burst writeback */ txdctl = IXGBE_READ_REG(hw, IXGBE_VFTXDCTL(i)); txdctl |= (8 << 16); IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(i), txdctl); /* Now enable */ txdctl = IXGBE_READ_REG(hw, IXGBE_VFTXDCTL(i)); txdctl |= IXGBE_TXDCTL_ENABLE; IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(i), txdctl); /* Set the HW Tx Head and Tail indices */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_VFTDH(i), 0); IXGBE_WRITE_REG(&adapter->hw, IXGBE_VFTDT(i), 0); /* Setup Transmit Descriptor Cmd Settings */ txr->txd_cmd = IXGBE_TXD_CMD_IFCS; txr->watchdog_check = FALSE; /* Set Ring parameters */ IXGBE_WRITE_REG(hw, IXGBE_VFTDBAL(i), (tdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_VFTDBAH(i), (tdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_VFTDLEN(i), adapter->num_tx_desc * sizeof(struct ixgbe_legacy_tx_desc)); txctrl = IXGBE_READ_REG(hw, IXGBE_VFDCA_TXCTRL(i)); txctrl &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; IXGBE_WRITE_REG(hw, IXGBE_VFDCA_TXCTRL(i), txctrl); break; } return; } /********************************************************************* * * Free all transmit rings. * **********************************************************************/ static void ixv_free_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) { IXV_TX_LOCK(txr); ixv_free_transmit_buffers(txr); ixv_dma_free(adapter, &txr->txdma); IXV_TX_UNLOCK(txr); IXV_TX_LOCK_DESTROY(txr); } free(adapter->tx_rings, M_DEVBUF); } /********************************************************************* * * Free transmit ring related data structures. * **********************************************************************/ static void ixv_free_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct ixv_tx_buf *tx_buffer; int i; INIT_DEBUGOUT("free_transmit_ring: begin"); if (txr->tx_buffers == NULL) return; tx_buffer = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, tx_buffer++) { if (tx_buffer->m_head != NULL) { bus_dmamap_sync(txr->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; if (tx_buffer->map != NULL) { bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } else if (tx_buffer->map != NULL) { bus_dmamap_unload(txr->txtag, tx_buffer->map); bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } #if __FreeBSD_version >= 800000 if (txr->br != NULL) buf_ring_free(txr->br, M_DEVBUF); #endif if (txr->tx_buffers != NULL) { free(txr->tx_buffers, M_DEVBUF); txr->tx_buffers = NULL; } if (txr->txtag != NULL) { bus_dma_tag_destroy(txr->txtag); txr->txtag = NULL; } return; } /********************************************************************* * * Advanced Context Descriptor setup for VLAN or CSUM * **********************************************************************/ static bool ixv_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp) { struct adapter *adapter = txr->adapter; struct ixgbe_adv_tx_context_desc *TXD; struct ixv_tx_buf *tx_buffer; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; struct ether_vlan_header *eh; struct ip *ip; struct ip6_hdr *ip6; int ehdrlen, ip_hlen = 0; u16 etype; u8 ipproto = 0; bool offload = TRUE; int ctxd = txr->next_avail_desc; u16 vtag = 0; if ((mp->m_pkthdr.csum_flags & CSUM_OFFLOAD) == 0) offload = FALSE; tx_buffer = &txr->tx_buffers[ctxd]; TXD = (struct ixgbe_adv_tx_context_desc *) &txr->tx_base[ctxd]; /* ** In advanced descriptors the vlan tag must ** be placed into the descriptor itself. */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT); } else if (offload == FALSE) return FALSE; /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } /* Set the ether header length */ vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT; switch (etype) { case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = ip->ip_hl << 2; if (mp->m_len < ehdrlen + ip_hlen) return (FALSE); ipproto = ip->ip_p; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); ip_hlen = sizeof(struct ip6_hdr); if (mp->m_len < ehdrlen + ip_hlen) return (FALSE); ipproto = ip6->ip6_nxt; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; break; default: offload = FALSE; break; } vlan_macip_lens |= ip_hlen; type_tucmd_mlhl |= IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT; switch (ipproto) { case IPPROTO_TCP: if (mp->m_pkthdr.csum_flags & CSUM_TCP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; break; case IPPROTO_UDP: if (mp->m_pkthdr.csum_flags & CSUM_UDP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP; break; #if __FreeBSD_version >= 800000 case IPPROTO_SCTP: if (mp->m_pkthdr.csum_flags & CSUM_SCTP) type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP; break; #endif default: offload = FALSE; break; } /* Now copy bits into descriptor */ TXD->vlan_macip_lens |= htole32(vlan_macip_lens); TXD->type_tucmd_mlhl |= htole32(type_tucmd_mlhl); TXD->seqnum_seed = htole32(0); TXD->mss_l4len_idx = htole32(0); tx_buffer->m_head = NULL; tx_buffer->eop_index = -1; /* We've consumed the first desc, adjust counters */ if (++ctxd == adapter->num_tx_desc) ctxd = 0; txr->next_avail_desc = ctxd; --txr->tx_avail; return (offload); } /********************************************************************** * * Setup work for hardware segmentation offload (TSO) on * adapters using advanced tx descriptors * **********************************************************************/ static bool ixv_tso_setup(struct tx_ring *txr, struct mbuf *mp, u32 *paylen) { struct adapter *adapter = txr->adapter; struct ixgbe_adv_tx_context_desc *TXD; struct ixv_tx_buf *tx_buffer; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; u32 mss_l4len_idx = 0; u16 vtag = 0; int ctxd, ehdrlen, hdrlen, ip_hlen, tcp_hlen; struct ether_vlan_header *eh; struct ip *ip; struct tcphdr *th; /* * Determine where frame payload starts. * Jump over vlan headers if already present */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehdrlen = ETHER_HDR_LEN; /* Ensure we have at least the IP+TCP header in the first mbuf. */ if (mp->m_len < ehdrlen + sizeof(struct ip) + sizeof(struct tcphdr)) return FALSE; ctxd = txr->next_avail_desc; tx_buffer = &txr->tx_buffers[ctxd]; TXD = (struct ixgbe_adv_tx_context_desc *) &txr->tx_base[ctxd]; ip = (struct ip *)(mp->m_data + ehdrlen); if (ip->ip_p != IPPROTO_TCP) return FALSE; /* 0 */ ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); tcp_hlen = th->th_off << 2; hdrlen = ehdrlen + ip_hlen + tcp_hlen; /* This is used in the transmit desc in encap */ *paylen = mp->m_pkthdr.len - hdrlen; /* VLAN MACLEN IPLEN */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT); } vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= ip_hlen; TXD->vlan_macip_lens |= htole32(vlan_macip_lens); /* ADV DTYPE TUCMD */ type_tucmd_mlhl |= IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; TXD->type_tucmd_mlhl |= htole32(type_tucmd_mlhl); /* MSS L4LEN IDX */ mss_l4len_idx |= (mp->m_pkthdr.tso_segsz << IXGBE_ADVTXD_MSS_SHIFT); mss_l4len_idx |= (tcp_hlen << IXGBE_ADVTXD_L4LEN_SHIFT); TXD->mss_l4len_idx = htole32(mss_l4len_idx); TXD->seqnum_seed = htole32(0); tx_buffer->m_head = NULL; tx_buffer->eop_index = -1; if (++ctxd == adapter->num_tx_desc) ctxd = 0; txr->tx_avail--; txr->next_avail_desc = ctxd; return TRUE; } /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. * **********************************************************************/ static bool ixv_txeof(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; u32 first, last, done; struct ixv_tx_buf *tx_buffer; struct ixgbe_legacy_tx_desc *tx_desc, *eop_desc; mtx_assert(&txr->tx_mtx, MA_OWNED); if (txr->tx_avail == adapter->num_tx_desc) return FALSE; first = txr->next_to_clean; tx_buffer = &txr->tx_buffers[first]; /* For cleanup we just use legacy struct */ tx_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[first]; last = tx_buffer->eop_index; if (last == -1) return FALSE; eop_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last]; /* ** Get the index of the first descriptor ** BEYOND the EOP and call that 'done'. ** I do this so the comparison in the ** inner while loop below can be simple */ if (++last == adapter->num_tx_desc) last = 0; done = last; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* ** Only the EOP descriptor of a packet now has the DD ** bit set, this is what we look for... */ while (eop_desc->upper.fields.status & IXGBE_TXD_STAT_DD) { /* We clean the range of the packet */ while (first != done) { tx_desc->upper.data = 0; tx_desc->lower.data = 0; tx_desc->buffer_addr = 0; ++txr->tx_avail; if (tx_buffer->m_head) { bus_dmamap_sync(txr->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; tx_buffer->map = NULL; } tx_buffer->eop_index = -1; txr->watchdog_time = ticks; if (++first == adapter->num_tx_desc) first = 0; tx_buffer = &txr->tx_buffers[first]; tx_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[first]; } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* See if there is more work now */ last = tx_buffer->eop_index; if (last != -1) { eop_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last]; /* Get next done point */ if (++last == adapter->num_tx_desc) last = 0; done = last; } else break; } bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); txr->next_to_clean = first; /* * If we have enough room, clear IFF_DRV_OACTIVE to tell the stack that * it is OK to send packets. If there are no pending descriptors, * clear the timeout. Otherwise, if some descriptors have been freed, * restart the timeout. */ if (txr->tx_avail > IXV_TX_CLEANUP_THRESHOLD) { ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (txr->tx_avail == adapter->num_tx_desc) { txr->watchdog_check = FALSE; return FALSE; } } return TRUE; } /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings * - now keeps its own state so discards due to resource * exhaustion are unnecessary, if an mbuf cannot be obtained * it just returns, keeping its placeholder, thus it can simply * be recalled to try again. * **********************************************************************/ static void ixv_refresh_mbufs(struct rx_ring *rxr, int limit) { struct adapter *adapter = rxr->adapter; bus_dma_segment_t hseg[1]; bus_dma_segment_t pseg[1]; struct ixv_rx_buf *rxbuf; struct mbuf *mh, *mp; int i, j, nsegs, error; bool refreshed = FALSE; i = j = rxr->next_to_refresh; /* Get the control variable, one beyond refresh point */ if (++j == adapter->num_rx_desc) j = 0; while (j != limit) { rxbuf = &rxr->rx_buffers[i]; if ((rxbuf->m_head == NULL) && (rxr->hdr_split)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) goto update; mh->m_pkthdr.len = mh->m_len = MHLEN; mh->m_len = MHLEN; mh->m_flags |= M_PKTHDR; m_adj(mh, ETHER_ALIGN); /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, rxbuf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("GET BUF: dmamap load" " failure - %d\n", error); m_free(mh); goto update; } rxbuf->m_head = mh; bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.hdr_addr = htole64(hseg[0].ds_addr); } if (rxbuf->m_pack == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (mp == NULL) goto update; } else mp = rxbuf->m_pack; mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("GET BUF: dmamap load" " failure - %d\n", error); m_free(mp); rxbuf->m_pack = NULL; goto update; } rxbuf->m_pack = mp; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.pkt_addr = htole64(pseg[0].ds_addr); refreshed = TRUE; rxr->next_to_refresh = i = j; /* Calculate next index */ if (++j == adapter->num_rx_desc) j = 0; } update: if (refreshed) /* update tail index */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_VFRDT(rxr->me), rxr->next_to_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int ixv_allocate_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; device_t dev = adapter->dev; struct ixv_rx_buf *rxbuf; int i, bsize, error; bsize = sizeof(struct ixv_rx_buf) * adapter->num_rx_desc; if (!(rxr->rx_buffers = (struct ixv_rx_buf *) malloc(bsize, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); error = ENOMEM; goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MSIZE, /* maxsize */ 1, /* nsegments */ MSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->htag))) { device_printf(dev, "Unable to create RX DMA tag\n"); goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUMPAGESIZE, /* maxsize */ 1, /* nsegments */ MJUMPAGESIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->ptag))) { device_printf(dev, "Unable to create RX DMA tag\n"); goto fail; } for (i = 0; i < adapter->num_rx_desc; i++, rxbuf++) { rxbuf = &rxr->rx_buffers[i]; error = bus_dmamap_create(rxr->htag, BUS_DMA_NOWAIT, &rxbuf->hmap); if (error) { device_printf(dev, "Unable to create RX head map\n"); goto fail; } error = bus_dmamap_create(rxr->ptag, BUS_DMA_NOWAIT, &rxbuf->pmap); if (error) { device_printf(dev, "Unable to create RX pkt map\n"); goto fail; } } return (0); fail: /* Frees all, but can handle partial completion */ ixv_free_receive_structures(adapter); return (error); } static void ixv_free_receive_ring(struct rx_ring *rxr) { struct adapter *adapter; struct ixv_rx_buf *rxbuf; int i; adapter = rxr->adapter; for (i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, rxbuf->hmap); rxbuf->m_head->m_flags |= M_PKTHDR; m_freem(rxbuf->m_head); } if (rxbuf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->m_pack->m_flags |= M_PKTHDR; m_freem(rxbuf->m_pack); } rxbuf->m_head = NULL; rxbuf->m_pack = NULL; } } /********************************************************************* * * Initialize a receive ring and its buffers. * **********************************************************************/ static int ixv_setup_receive_ring(struct rx_ring *rxr) { struct adapter *adapter; struct ifnet *ifp; device_t dev; struct ixv_rx_buf *rxbuf; bus_dma_segment_t pseg[1], hseg[1]; struct lro_ctrl *lro = &rxr->lro; int rsize, nsegs, error = 0; adapter = rxr->adapter; ifp = adapter->ifp; dev = adapter->dev; /* Clear the ring contents */ IXV_RX_LOCK(rxr); rsize = roundup2(adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); /* Free current RX buffer structs and their mbufs */ ixv_free_receive_ring(rxr); /* Configure header split? */ if (ixv_header_split) rxr->hdr_split = TRUE; /* Now replenish the mbufs */ for (int j = 0; j != adapter->num_rx_desc; ++j) { struct mbuf *mh, *mp; rxbuf = &rxr->rx_buffers[j]; /* ** Dont allocate mbufs if not ** doing header split, its wasteful */ if (rxr->hdr_split == FALSE) goto skip_head; /* First the header */ rxbuf->m_head = m_gethdr(M_NOWAIT, MT_DATA); if (rxbuf->m_head == NULL) { error = ENOBUFS; goto fail; } m_adj(rxbuf->m_head, ETHER_ALIGN); mh = rxbuf->m_head; mh->m_len = mh->m_pkthdr.len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, rxbuf->hmap, rxbuf->m_head, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) /* Nothing elegant to do here */ goto fail; bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->rx_base[j].read.hdr_addr = htole64(hseg[0].ds_addr); skip_head: /* Now the payload cluster */ rxbuf->m_pack = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (rxbuf->m_pack == NULL) { error = ENOBUFS; goto fail; } mp = rxbuf->m_pack; mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) goto fail; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->rx_base[j].read.pkt_addr = htole64(pseg[0].ds_addr); } /* Setup our descriptor indices */ rxr->next_to_check = 0; rxr->next_to_refresh = 0; rxr->lro_enabled = FALSE; rxr->rx_split_packets = 0; rxr->rx_bytes = 0; rxr->discard = FALSE; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* ** Now set up the LRO interface: */ if (ifp->if_capenable & IFCAP_LRO) { int err = tcp_lro_init(lro); if (err) { device_printf(dev, "LRO Initialization failed!\n"); goto fail; } INIT_DEBUGOUT("RX Soft LRO Initialized\n"); rxr->lro_enabled = TRUE; lro->ifp = adapter->ifp; } IXV_RX_UNLOCK(rxr); return (0); fail: ixv_free_receive_ring(rxr); IXV_RX_UNLOCK(rxr); return (error); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int ixv_setup_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; int j; for (j = 0; j < adapter->num_queues; j++, rxr++) if (ixv_setup_receive_ring(rxr)) goto fail; return (0); fail: /* * Free RX buffers allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'j' failed, so its the terminus. */ for (int i = 0; i < j; ++i) { rxr = &adapter->rx_rings[i]; ixv_free_receive_ring(rxr); } return (ENOBUFS); } /********************************************************************* * * Setup receive registers and features. * **********************************************************************/ #define IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT 2 static void ixv_initialize_receive_units(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 bufsz, fctrl, rxcsum, hlreg; /* Enable broadcasts */ fctrl = IXGBE_READ_REG(hw, IXGBE_FCTRL); fctrl |= IXGBE_FCTRL_BAM; fctrl |= IXGBE_FCTRL_DPF; fctrl |= IXGBE_FCTRL_PMCF; IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl); /* Set for Jumbo Frames? */ hlreg = IXGBE_READ_REG(hw, IXGBE_HLREG0); if (ifp->if_mtu > ETHERMTU) { hlreg |= IXGBE_HLREG0_JUMBOEN; bufsz = 4096 >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; } else { hlreg &= ~IXGBE_HLREG0_JUMBOEN; bufsz = 2048 >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; } IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg); for (int i = 0; i < adapter->num_queues; i++, rxr++) { u64 rdba = rxr->rxdma.dma_paddr; u32 reg, rxdctl; /* Do the queue enabling first */ rxdctl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i)); rxdctl |= IXGBE_RXDCTL_ENABLE; IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(i), rxdctl); for (int k = 0; k < 10; k++) { if (IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i)) & IXGBE_RXDCTL_ENABLE) break; else msec_delay(1); } wmb(); /* Setup the Base and Length of the Rx Descriptor Ring */ IXGBE_WRITE_REG(hw, IXGBE_VFRDBAL(i), (rdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_VFRDBAH(i), (rdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_VFRDLEN(i), adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc)); /* Set up the SRRCTL register */ reg = IXGBE_READ_REG(hw, IXGBE_VFSRRCTL(i)); reg &= ~IXGBE_SRRCTL_BSIZEHDR_MASK; reg &= ~IXGBE_SRRCTL_BSIZEPKT_MASK; reg |= bufsz; if (rxr->hdr_split) { /* Use a standard mbuf for the header */ reg |= ((IXV_RX_HDR << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT) & IXGBE_SRRCTL_BSIZEHDR_MASK); reg |= IXGBE_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS; } else reg |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF; IXGBE_WRITE_REG(hw, IXGBE_VFSRRCTL(i), reg); /* Setup the HW Rx Head and Tail Descriptor Pointers */ IXGBE_WRITE_REG(hw, IXGBE_VFRDH(rxr->me), 0); IXGBE_WRITE_REG(hw, IXGBE_VFRDT(rxr->me), adapter->num_rx_desc - 1); } rxcsum = IXGBE_READ_REG(hw, IXGBE_RXCSUM); if (ifp->if_capenable & IFCAP_RXCSUM) rxcsum |= IXGBE_RXCSUM_PCSD; if (!(rxcsum & IXGBE_RXCSUM_PCSD)) rxcsum |= IXGBE_RXCSUM_IPPCSE; IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum); return; } /********************************************************************* * * Free all receive rings. * **********************************************************************/ static void ixv_free_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; for (int i = 0; i < adapter->num_queues; i++, rxr++) { struct lro_ctrl *lro = &rxr->lro; ixv_free_receive_buffers(rxr); /* Free LRO memory */ tcp_lro_free(lro); /* Free the ring memory as well */ ixv_dma_free(adapter, &rxr->rxdma); } free(adapter->rx_rings, M_DEVBUF); } /********************************************************************* * * Free receive ring data structures * **********************************************************************/ static void ixv_free_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct ixv_rx_buf *rxbuf; INIT_DEBUGOUT("free_receive_structures: begin"); /* Cleanup any existing buffers */ if (rxr->rx_buffers != NULL) { for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, rxbuf->hmap); rxbuf->m_head->m_flags |= M_PKTHDR; m_freem(rxbuf->m_head); } if (rxbuf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->m_pack->m_flags |= M_PKTHDR; m_freem(rxbuf->m_pack); } rxbuf->m_head = NULL; rxbuf->m_pack = NULL; if (rxbuf->hmap != NULL) { bus_dmamap_destroy(rxr->htag, rxbuf->hmap); rxbuf->hmap = NULL; } if (rxbuf->pmap != NULL) { bus_dmamap_destroy(rxr->ptag, rxbuf->pmap); rxbuf->pmap = NULL; } } if (rxr->rx_buffers != NULL) { free(rxr->rx_buffers, M_DEVBUF); rxr->rx_buffers = NULL; } } if (rxr->htag != NULL) { bus_dma_tag_destroy(rxr->htag); rxr->htag = NULL; } if (rxr->ptag != NULL) { bus_dma_tag_destroy(rxr->ptag); rxr->ptag = NULL; } return; } static __inline void ixv_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u32 ptype) { /* * ATM LRO is only for IPv4/TCP packets and TCP checksum of the packet * should be computed by hardware. Also it should not have VLAN tag in * ethernet header. */ if (rxr->lro_enabled && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (ptype & IXGBE_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & (IXGBE_RXDADV_PKTTYPE_IPV4 | IXGBE_RXDADV_PKTTYPE_TCP)) == (IXGBE_RXDADV_PKTTYPE_IPV4 | IXGBE_RXDADV_PKTTYPE_TCP) && (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) == (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) { /* * Send to the stack if: ** - LRO not enabled, or ** - no LRO resources, or ** - lro enqueue fails */ if (rxr->lro.lro_cnt != 0) if (tcp_lro_rx(&rxr->lro, m, 0) == 0) return; } IXV_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, m); IXV_RX_LOCK(rxr); } static __inline void ixv_rx_discard(struct rx_ring *rxr, int i) { struct ixv_rx_buf *rbuf; rbuf = &rxr->rx_buffers[i]; if (rbuf->fmp != NULL) {/* Partial chain ? */ rbuf->fmp->m_flags |= M_PKTHDR; m_freem(rbuf->fmp); rbuf->fmp = NULL; } /* ** With advanced descriptors the writeback ** clobbers the buffer addrs, so its easier ** to just free the existing mbufs and take ** the normal refresh path to get new buffers ** and mapping. */ if (rbuf->m_head) { m_free(rbuf->m_head); rbuf->m_head = NULL; } if (rbuf->m_pack) { m_free(rbuf->m_pack); rbuf->m_pack = NULL; } return; } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * Return TRUE for more work, FALSE for all clean. *********************************************************************/ static bool ixv_rxeof(struct ix_queue *que, int count) { struct adapter *adapter = que->adapter; struct rx_ring *rxr = que->rxr; struct ifnet *ifp = adapter->ifp; struct lro_ctrl *lro = &rxr->lro; struct lro_entry *queued; int i, nextp, processed = 0; u32 staterr = 0; union ixgbe_adv_rx_desc *cur; struct ixv_rx_buf *rbuf, *nbuf; IXV_RX_LOCK(rxr); for (i = rxr->next_to_check; count != 0;) { struct mbuf *sendmp, *mh, *mp; u32 rsc, ptype; u16 hlen, plen, hdr, vtag; bool eop; /* Sync the ring. */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); cur = &rxr->rx_base[i]; staterr = le32toh(cur->wb.upper.status_error); if ((staterr & IXGBE_RXD_STAT_DD) == 0) break; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; count--; sendmp = NULL; nbuf = NULL; rsc = 0; cur->wb.upper.status_error = 0; rbuf = &rxr->rx_buffers[i]; mh = rbuf->m_head; mp = rbuf->m_pack; plen = le16toh(cur->wb.upper.length); ptype = le32toh(cur->wb.lower.lo_dword.data) & IXGBE_RXDADV_PKTTYPE_MASK; hdr = le16toh(cur->wb.lower.lo_dword.hs_rss.hdr_info); vtag = le16toh(cur->wb.upper.vlan); eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0); /* Make sure all parts of a bad packet are discarded */ if (((staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) != 0) || (rxr->discard)) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); rxr->rx_discarded++; if (!eop) rxr->discard = TRUE; else rxr->discard = FALSE; ixv_rx_discard(rxr, i); goto next_desc; } if (!eop) { nextp = i + 1; if (nextp == adapter->num_rx_desc) nextp = 0; nbuf = &rxr->rx_buffers[nextp]; prefetch(nbuf); } /* ** The header mbuf is ONLY used when header ** split is enabled, otherwise we get normal ** behavior, ie, both header and payload ** are DMA'd into the payload buffer. ** ** Rather than using the fmp/lmp global pointers ** we now keep the head of a packet chain in the ** buffer struct and pass this along from one ** descriptor to the next, until we get EOP. */ if (rxr->hdr_split && (rbuf->fmp == NULL)) { /* This must be an initial descriptor */ hlen = (hdr & IXGBE_RXDADV_HDRBUFLEN_MASK) >> IXGBE_RXDADV_HDRBUFLEN_SHIFT; if (hlen > IXV_RX_HDR) hlen = IXV_RX_HDR; mh->m_len = hlen; mh->m_flags |= M_PKTHDR; mh->m_next = NULL; mh->m_pkthdr.len = mh->m_len; /* Null buf pointer so it is refreshed */ rbuf->m_head = NULL; /* ** Check the payload length, this ** could be zero if its a small ** packet. */ if (plen > 0) { mp->m_len = plen; mp->m_next = NULL; mp->m_flags &= ~M_PKTHDR; mh->m_next = mp; mh->m_pkthdr.len += mp->m_len; /* Null buf pointer so it is refreshed */ rbuf->m_pack = NULL; rxr->rx_split_packets++; } /* ** Now create the forward ** chain so when complete ** we wont have to. */ if (eop == 0) { /* stash the chain head */ nbuf->fmp = mh; /* Make forward chain */ if (plen) mp->m_next = nbuf->m_pack; else mh->m_next = nbuf->m_pack; } else { /* Singlet, prepare to send */ sendmp = mh; if ((adapter->num_vlans) && (staterr & IXGBE_RXD_STAT_VP)) { sendmp->m_pkthdr.ether_vtag = vtag; sendmp->m_flags |= M_VLANTAG; } } } else { /* ** Either no header split, or a ** secondary piece of a fragmented ** split packet. */ mp->m_len = plen; /* ** See if there is a stored head ** that determines what we are */ sendmp = rbuf->fmp; rbuf->m_pack = rbuf->fmp = NULL; if (sendmp != NULL) /* secondary frag */ sendmp->m_pkthdr.len += mp->m_len; else { /* first desc of a non-ps chain */ sendmp = mp; sendmp->m_flags |= M_PKTHDR; sendmp->m_pkthdr.len = mp->m_len; if (staterr & IXGBE_RXD_STAT_VP) { sendmp->m_pkthdr.ether_vtag = vtag; sendmp->m_flags |= M_VLANTAG; } } /* Pass the head pointer on */ if (eop == 0) { nbuf->fmp = sendmp; sendmp = NULL; mp->m_next = nbuf->m_pack; } } ++processed; /* Sending this frame? */ if (eop) { sendmp->m_pkthdr.rcvif = ifp; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); rxr->rx_packets++; /* capture data for AIM */ rxr->bytes += sendmp->m_pkthdr.len; rxr->rx_bytes += sendmp->m_pkthdr.len; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) ixv_rx_checksum(staterr, sendmp, ptype); #if __FreeBSD_version >= 800000 sendmp->m_pkthdr.flowid = que->msix; - sendmp->m_flags |= M_FLOWID; + M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE); #endif } next_desc: bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == adapter->num_rx_desc) i = 0; /* Now send to the stack or do LRO */ if (sendmp != NULL) ixv_rx_input(rxr, ifp, sendmp, ptype); /* Every 8 descriptors we go to refresh mbufs */ if (processed == 8) { ixv_refresh_mbufs(rxr, i); processed = 0; } } /* Refresh any remaining buf structs */ if (ixv_rx_unrefreshed(rxr)) ixv_refresh_mbufs(rxr, i); rxr->next_to_check = i; /* * Flush any outstanding LRO work */ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } IXV_RX_UNLOCK(rxr); /* ** We still have cleaning to do? ** Schedule another interrupt if so. */ if ((staterr & IXGBE_RXD_STAT_DD) != 0) { ixv_rearm_queues(adapter, (u64)(1 << que->msix)); return (TRUE); } return (FALSE); } /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of checksum so that stack * doesn't spend time verifying the checksum. * *********************************************************************/ static void ixv_rx_checksum(u32 staterr, struct mbuf * mp, u32 ptype) { u16 status = (u16) staterr; u8 errors = (u8) (staterr >> 24); bool sctp = FALSE; if ((ptype & IXGBE_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & IXGBE_RXDADV_PKTTYPE_SCTP) != 0) sctp = TRUE; if (status & IXGBE_RXD_STAT_IPCS) { if (!(errors & IXGBE_RXD_ERR_IPE)) { /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; } else mp->m_pkthdr.csum_flags = 0; } if (status & IXGBE_RXD_STAT_L4CS) { u64 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); #if __FreeBSD_version >= 800000 if (sctp) type = CSUM_SCTP_VALID; #endif if (!(errors & IXGBE_RXD_ERR_TCPE)) { mp->m_pkthdr.csum_flags |= type; if (!sctp) mp->m_pkthdr.csum_data = htons(0xffff); } } return; } static void ixv_setup_vlan_support(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 ctrl, vid, vfta, retry; /* ** We get here thru init_locked, meaning ** a soft reset, this has already cleared ** the VFTA and other state, so if there ** have been no vlan's registered do nothing. */ if (adapter->num_vlans == 0) return; /* Enable the queues */ for (int i = 0; i < adapter->num_queues; i++) { ctrl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i)); ctrl |= IXGBE_RXDCTL_VME; IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(i), ctrl); } /* ** A soft reset zero's out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < VFTA_SIZE; i++) { if (ixv_shadow_vfta[i] == 0) continue; vfta = ixv_shadow_vfta[i]; /* ** Reconstruct the vlan id's ** based on the bits set in each ** of the array ints. */ for ( int j = 0; j < 32; j++) { retry = 0; if ((vfta & (1 << j)) == 0) continue; vid = (i * 32) + j; /* Call the shared code mailbox routine */ while (ixgbe_set_vfta(hw, vid, 0, TRUE)) { if (++retry > 5) break; } } } } /* ** This routine is run via an vlan config EVENT, ** it enables us to use the HW Filter table since ** we can get the vlan id. This just creates the ** entry in the soft version of the VFTA, init will ** repopulate the real table. */ static void ixv_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXV_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; ixv_shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; /* Re-init to load the changes */ ixv_init_locked(adapter); IXV_CORE_UNLOCK(adapter); } /* ** This routine is run via an vlan ** unconfig EVENT, remove our entry ** in the soft vfta. */ static void ixv_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXV_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; ixv_shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Re-init to load the changes */ ixv_init_locked(adapter); IXV_CORE_UNLOCK(adapter); } static void ixv_enable_intr(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; struct ix_queue *que = adapter->queues; u32 mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE); IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask); mask = IXGBE_EIMS_ENABLE_MASK; mask &= ~(IXGBE_EIMS_OTHER | IXGBE_EIMS_LSC); IXGBE_WRITE_REG(hw, IXGBE_VTEIAC, mask); for (int i = 0; i < adapter->num_queues; i++, que++) ixv_enable_queue(adapter, que->msix); IXGBE_WRITE_FLUSH(hw); return; } static void ixv_disable_intr(struct adapter *adapter) { IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEIAC, 0); IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEIMC, ~0); IXGBE_WRITE_FLUSH(&adapter->hw); return; } /* ** Setup the correct IVAR register for a particular MSIX interrupt ** - entry is the register array entry ** - vector is the MSIX vector for this queue ** - type is RX/TX/MISC */ static void ixv_set_ivar(struct adapter *adapter, u8 entry, u8 vector, s8 type) { struct ixgbe_hw *hw = &adapter->hw; u32 ivar, index; vector |= IXGBE_IVAR_ALLOC_VAL; if (type == -1) { /* MISC IVAR */ ivar = IXGBE_READ_REG(hw, IXGBE_VTIVAR_MISC); ivar &= ~0xFF; ivar |= vector; IXGBE_WRITE_REG(hw, IXGBE_VTIVAR_MISC, ivar); } else { /* RX/TX IVARS */ index = (16 * (entry & 1)) + (8 * type); ivar = IXGBE_READ_REG(hw, IXGBE_VTIVAR(entry >> 1)); ivar &= ~(0xFF << index); ivar |= (vector << index); IXGBE_WRITE_REG(hw, IXGBE_VTIVAR(entry >> 1), ivar); } } static void ixv_configure_ivars(struct adapter *adapter) { struct ix_queue *que = adapter->queues; for (int i = 0; i < adapter->num_queues; i++, que++) { /* First the RX queue entry */ ixv_set_ivar(adapter, i, que->msix, 0); /* ... and the TX */ ixv_set_ivar(adapter, i, que->msix, 1); /* Set an initial value in EITR */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEITR(que->msix), IXV_EITR_DEFAULT); } /* For the Link interrupt */ ixv_set_ivar(adapter, 1, adapter->mbxvec, -1); } /* ** Tasklet handler for MSIX MBX interrupts ** - do outside interrupt since it might sleep */ static void ixv_handle_mbx(void *context, int pending) { struct adapter *adapter = context; ixgbe_check_link(&adapter->hw, &adapter->link_speed, &adapter->link_up, 0); ixv_update_link_status(adapter); } /* ** The VF stats registers never have a truely virgin ** starting point, so this routine tries to make an ** artificial one, marking ground zero on attach as ** it were. */ static void ixv_save_stats(struct adapter *adapter) { if (adapter->stats.vfgprc || adapter->stats.vfgptc) { adapter->stats.saved_reset_vfgprc += adapter->stats.vfgprc - adapter->stats.base_vfgprc; adapter->stats.saved_reset_vfgptc += adapter->stats.vfgptc - adapter->stats.base_vfgptc; adapter->stats.saved_reset_vfgorc += adapter->stats.vfgorc - adapter->stats.base_vfgorc; adapter->stats.saved_reset_vfgotc += adapter->stats.vfgotc - adapter->stats.base_vfgotc; adapter->stats.saved_reset_vfmprc += adapter->stats.vfmprc - adapter->stats.base_vfmprc; } } static void ixv_init_stats(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; adapter->stats.last_vfgprc = IXGBE_READ_REG(hw, IXGBE_VFGPRC); adapter->stats.last_vfgorc = IXGBE_READ_REG(hw, IXGBE_VFGORC_LSB); adapter->stats.last_vfgorc |= (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGORC_MSB))) << 32); adapter->stats.last_vfgptc = IXGBE_READ_REG(hw, IXGBE_VFGPTC); adapter->stats.last_vfgotc = IXGBE_READ_REG(hw, IXGBE_VFGOTC_LSB); adapter->stats.last_vfgotc |= (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGOTC_MSB))) << 32); adapter->stats.last_vfmprc = IXGBE_READ_REG(hw, IXGBE_VFMPRC); adapter->stats.base_vfgprc = adapter->stats.last_vfgprc; adapter->stats.base_vfgorc = adapter->stats.last_vfgorc; adapter->stats.base_vfgptc = adapter->stats.last_vfgptc; adapter->stats.base_vfgotc = adapter->stats.last_vfgotc; adapter->stats.base_vfmprc = adapter->stats.last_vfmprc; } #define UPDATE_STAT_32(reg, last, count) \ { \ u32 current = IXGBE_READ_REG(hw, reg); \ if (current < last) \ count += 0x100000000LL; \ last = current; \ count &= 0xFFFFFFFF00000000LL; \ count |= current; \ } #define UPDATE_STAT_36(lsb, msb, last, count) \ { \ u64 cur_lsb = IXGBE_READ_REG(hw, lsb); \ u64 cur_msb = IXGBE_READ_REG(hw, msb); \ u64 current = ((cur_msb << 32) | cur_lsb); \ if (current < last) \ count += 0x1000000000LL; \ last = current; \ count &= 0xFFFFFFF000000000LL; \ count |= current; \ } /* ** ixv_update_stats - Update the board statistics counters. */ void ixv_update_stats(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; UPDATE_STAT_32(IXGBE_VFGPRC, adapter->stats.last_vfgprc, adapter->stats.vfgprc); UPDATE_STAT_32(IXGBE_VFGPTC, adapter->stats.last_vfgptc, adapter->stats.vfgptc); UPDATE_STAT_36(IXGBE_VFGORC_LSB, IXGBE_VFGORC_MSB, adapter->stats.last_vfgorc, adapter->stats.vfgorc); UPDATE_STAT_36(IXGBE_VFGOTC_LSB, IXGBE_VFGOTC_MSB, adapter->stats.last_vfgotc, adapter->stats.vfgotc); UPDATE_STAT_32(IXGBE_VFMPRC, adapter->stats.last_vfmprc, adapter->stats.vfmprc); } /********************************************************************** * * This routine is called only when ixgbe_display_debug_stats is enabled. * This routine provides a way to take a look at important statistics * maintained by the driver and hardware. * **********************************************************************/ static void ixv_print_hw_stats(struct adapter * adapter) { device_t dev = adapter->dev; device_printf(dev,"Std Mbuf Failed = %lu\n", adapter->mbuf_defrag_failed); device_printf(dev,"Driver dropped packets = %lu\n", adapter->dropped_pkts); device_printf(dev, "watchdog timeouts = %ld\n", adapter->watchdog_events); device_printf(dev,"Good Packets Rcvd = %llu\n", (long long)adapter->stats.vfgprc); device_printf(dev,"Good Packets Xmtd = %llu\n", (long long)adapter->stats.vfgptc); device_printf(dev,"TSO Transmissions = %lu\n", adapter->tso_tx); } /********************************************************************** * * This routine is called only when em_display_debug_stats is enabled. * This routine provides a way to take a look at important statistics * maintained by the driver and hardware. * **********************************************************************/ static void ixv_print_debug_info(struct adapter *adapter) { device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; struct ix_queue *que = adapter->queues; struct rx_ring *rxr; struct tx_ring *txr; struct lro_ctrl *lro; device_printf(dev,"Error Byte Count = %u \n", IXGBE_READ_REG(hw, IXGBE_ERRBC)); for (int i = 0; i < adapter->num_queues; i++, que++) { txr = que->txr; rxr = que->rxr; lro = &rxr->lro; device_printf(dev,"QUE(%d) IRQs Handled: %lu\n", que->msix, (long)que->irqs); device_printf(dev,"RX(%d) Packets Received: %lld\n", rxr->me, (long long)rxr->rx_packets); device_printf(dev,"RX(%d) Split RX Packets: %lld\n", rxr->me, (long long)rxr->rx_split_packets); device_printf(dev,"RX(%d) Bytes Received: %lu\n", rxr->me, (long)rxr->rx_bytes); device_printf(dev,"RX(%d) LRO Queued= %d\n", rxr->me, lro->lro_queued); device_printf(dev,"RX(%d) LRO Flushed= %d\n", rxr->me, lro->lro_flushed); device_printf(dev,"TX(%d) Packets Sent: %lu\n", txr->me, (long)txr->total_packets); device_printf(dev,"TX(%d) NO Desc Avail: %lu\n", txr->me, (long)txr->no_desc_avail); } device_printf(dev,"MBX IRQ Handled: %lu\n", (long)adapter->mbx_irq); return; } static int ixv_sysctl_stats(SYSCTL_HANDLER_ARGS) { int error; int result; struct adapter *adapter; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); if (result == 1) { adapter = (struct adapter *) arg1; ixv_print_hw_stats(adapter); } return error; } static int ixv_sysctl_debug(SYSCTL_HANDLER_ARGS) { int error, result; struct adapter *adapter; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); if (result == 1) { adapter = (struct adapter *) arg1; ixv_print_debug_info(adapter); } return error; } /* ** Set flow control using sysctl: ** Flow control values: ** 0 - off ** 1 - rx pause ** 2 - tx pause ** 3 - full */ static int ixv_set_flowcntl(SYSCTL_HANDLER_ARGS) { int error; struct adapter *adapter; error = sysctl_handle_int(oidp, &ixv_flow_control, 0, req); if (error) return (error); adapter = (struct adapter *) arg1; switch (ixv_flow_control) { case ixgbe_fc_rx_pause: case ixgbe_fc_tx_pause: case ixgbe_fc_full: adapter->hw.fc.requested_mode = ixv_flow_control; break; case ixgbe_fc_none: default: adapter->hw.fc.requested_mode = ixgbe_fc_none; } ixgbe_fc_enable(&adapter->hw); return error; } static void ixv_add_rx_process_limit(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLFLAG_RW, limit, value, description); } Index: head/sys/dev/ixl/ixl_txrx.c =================================================================== --- head/sys/dev/ixl/ixl_txrx.c (revision 275357) +++ head/sys/dev/ixl/ixl_txrx.c (revision 275358) @@ -1,1673 +1,1673 @@ /****************************************************************************** Copyright (c) 2013-2014, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ /* ** IXL driver TX/RX Routines: ** This was seperated to allow usage by ** both the BASE and the VF drivers. */ #include "opt_inet.h" #include "opt_inet6.h" #include "ixl.h" /* Local Prototypes */ static void ixl_rx_checksum(struct mbuf *, u32, u32, u8); static void ixl_refresh_mbufs(struct ixl_queue *, int); static int ixl_xmit(struct ixl_queue *, struct mbuf **); static int ixl_tx_setup_offload(struct ixl_queue *, struct mbuf *, u32 *, u32 *); static bool ixl_tso_setup(struct ixl_queue *, struct mbuf *); static __inline void ixl_rx_discard(struct rx_ring *, int); static __inline void ixl_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u8); /* ** Multiqueue Transmit driver ** */ int ixl_mq_start(struct ifnet *ifp, struct mbuf *m) { struct ixl_vsi *vsi = ifp->if_softc; struct ixl_queue *que; struct tx_ring *txr; int err, i; - /* Which queue to use */ - if ((m->m_flags & M_FLOWID) != 0) + /* check if flowid is set */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % vsi->num_queues; else i = curcpu % vsi->num_queues; /* Check for a hung queue and pick alternative */ if (((1 << i) & vsi->active_queues) == 0) i = ffsl(vsi->active_queues); que = &vsi->queues[i]; txr = &que->txr; err = drbr_enqueue(ifp, txr->br, m); if (err) return(err); if (IXL_TX_TRYLOCK(txr)) { ixl_mq_start_locked(ifp, txr); IXL_TX_UNLOCK(txr); } else taskqueue_enqueue(que->tq, &que->tx_task); return (0); } int ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct ixl_queue *que = txr->que; struct ixl_vsi *vsi = que->vsi; struct mbuf *next; int err = 0; if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) || vsi->link_active == 0) return (ENETDOWN); /* Process the transmit queue */ while ((next = drbr_peek(ifp, txr->br)) != NULL) { if ((err = ixl_xmit(que, &next)) != 0) { if (next == NULL) drbr_advance(ifp, txr->br); else drbr_putback(ifp, txr->br, next); break; } drbr_advance(ifp, txr->br); /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (txr->avail < IXL_TX_CLEANUP_THRESHOLD) ixl_txeof(que); return (err); } /* * Called from a taskqueue to drain queued transmit packets. */ void ixl_deferred_mq_start(void *arg, int pending) { struct ixl_queue *que = arg; struct tx_ring *txr = &que->txr; struct ixl_vsi *vsi = que->vsi; struct ifnet *ifp = vsi->ifp; IXL_TX_LOCK(txr); if (!drbr_empty(ifp, txr->br)) ixl_mq_start_locked(ifp, txr); IXL_TX_UNLOCK(txr); } /* ** Flush all queue ring buffers */ void ixl_qflush(struct ifnet *ifp) { struct ixl_vsi *vsi = ifp->if_softc; for (int i = 0; i < vsi->num_queues; i++) { struct ixl_queue *que = &vsi->queues[i]; struct tx_ring *txr = &que->txr; struct mbuf *m; IXL_TX_LOCK(txr); while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) m_freem(m); IXL_TX_UNLOCK(txr); } if_qflush(ifp); } /* ** Find mbuf chains passed to the driver ** that are 'sparse', using more than 8 ** mbufs to deliver an mss-size chunk of data */ static inline bool ixl_tso_detect_sparse(struct mbuf *mp) { struct mbuf *m; int num = 0, mss; bool ret = FALSE; mss = mp->m_pkthdr.tso_segsz; for (m = mp->m_next; m != NULL; m = m->m_next) { num++; mss -= m->m_len; if (mss < 1) break; if (m->m_next == NULL) break; } if (num > IXL_SPARSE_CHAIN) ret = TRUE; return (ret); } /********************************************************************* * * This routine maps the mbufs to tx descriptors, allowing the * TX engine to transmit the packets. * - return 0 on success, positive on failure * **********************************************************************/ #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS) static int ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp) { struct ixl_vsi *vsi = que->vsi; struct i40e_hw *hw = vsi->hw; struct tx_ring *txr = &que->txr; struct ixl_tx_buf *buf; struct i40e_tx_desc *txd = NULL; struct mbuf *m_head, *m; int i, j, error, nsegs, maxsegs; int first, last = 0; u16 vtag = 0; u32 cmd, off; bus_dmamap_t map; bus_dma_tag_t tag; bus_dma_segment_t segs[IXL_MAX_TSO_SEGS]; cmd = off = 0; m_head = *m_headp; /* * Important to capture the first descriptor * used because it will contain the index of * the one we tell the hardware to report back */ first = txr->next_avail; buf = &txr->buffers[first]; map = buf->map; tag = txr->tx_tag; maxsegs = IXL_MAX_TX_SEGS; if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { /* Use larger mapping for TSO */ tag = txr->tso_tag; maxsegs = IXL_MAX_TSO_SEGS; if (ixl_tso_detect_sparse(m_head)) { m = m_defrag(m_head, M_NOWAIT); if (m == NULL) { m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; } } /* * Map the packet for DMA. */ error = bus_dmamap_load_mbuf_sg(tag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m; m = m_collapse(*m_headp, M_NOWAIT, maxsegs); if (m == NULL) { que->mbuf_defrag_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; /* Try it again */ error = bus_dmamap_load_mbuf_sg(tag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (error == ENOMEM) { que->tx_dma_setup++; return (error); } else if (error != 0) { que->tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } } else if (error == ENOMEM) { que->tx_dma_setup++; return (error); } else if (error != 0) { que->tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } /* Make certain there are enough descriptors */ if (nsegs > txr->avail - 2) { txr->no_desc++; error = ENOBUFS; goto xmit_fail; } m_head = *m_headp; /* Set up the TSO/CSUM offload */ if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) { error = ixl_tx_setup_offload(que, m_head, &cmd, &off); if (error) goto xmit_fail; } cmd |= I40E_TX_DESC_CMD_ICRC; /* Grab the VLAN tag */ if (m_head->m_flags & M_VLANTAG) { cmd |= I40E_TX_DESC_CMD_IL2TAG1; vtag = htole16(m_head->m_pkthdr.ether_vtag); } i = txr->next_avail; for (j = 0; j < nsegs; j++) { bus_size_t seglen; buf = &txr->buffers[i]; buf->tag = tag; /* Keep track of the type tag */ txd = &txr->base[i]; seglen = segs[j].ds_len; txd->buffer_addr = htole64(segs[j].ds_addr); txd->cmd_type_offset_bsz = htole64(I40E_TX_DESC_DTYPE_DATA | ((u64)cmd << I40E_TXD_QW1_CMD_SHIFT) | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT) | ((u64)seglen << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) | ((u64)vtag << I40E_TXD_QW1_L2TAG1_SHIFT)); last = i; /* descriptor that will get completion IRQ */ if (++i == que->num_desc) i = 0; buf->m_head = NULL; buf->eop_index = -1; } /* Set the last descriptor for report */ txd->cmd_type_offset_bsz |= htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT)); txr->avail -= nsegs; txr->next_avail = i; buf->m_head = m_head; /* Swap the dma map between the first and last descriptor */ txr->buffers[first].map = buf->map; buf->map = map; bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE); /* Set the index of the descriptor that will be marked done */ buf = &txr->buffers[first]; buf->eop_index = last; bus_dmamap_sync(txr->dma.tag, txr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * Advance the Transmit Descriptor Tail (Tdt), this tells the * hardware that this frame is available to transmit. */ ++txr->total_packets; wr32(hw, txr->tail, i); ixl_flush(hw); /* Mark outstanding work */ if (que->busy == 0) que->busy = 1; return (0); xmit_fail: bus_dmamap_unload(tag, buf->map); return (error); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ int ixl_allocate_tx_data(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; struct ixl_vsi *vsi = que->vsi; device_t dev = vsi->dev; struct ixl_tx_buf *buf; int error = 0; /* * Setup DMA descriptor areas. */ if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IXL_TSO_SIZE, /* maxsize */ IXL_MAX_TX_SEGS, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->tx_tag))) { device_printf(dev,"Unable to allocate TX DMA tag\n"); goto fail; } /* Make a special tag for TSO */ if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IXL_TSO_SIZE, /* maxsize */ IXL_MAX_TSO_SEGS, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->tso_tag))) { device_printf(dev,"Unable to allocate TX TSO DMA tag\n"); goto fail; } if (!(txr->buffers = (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) * que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer default dma maps */ buf = txr->buffers; for (int i = 0; i < que->num_desc; i++, buf++) { buf->tag = txr->tx_tag; error = bus_dmamap_create(buf->tag, 0, &buf->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } fail: return (error); } /********************************************************************* * * (Re)Initialize a queue transmit ring. * - called by init, it clears the descriptor ring, * and frees any stale mbufs * **********************************************************************/ void ixl_init_tx_ring(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; struct ixl_tx_buf *buf; /* Clear the old ring contents */ IXL_TX_LOCK(txr); bzero((void *)txr->base, (sizeof(struct i40e_tx_desc)) * que->num_desc); /* Reset indices */ txr->next_avail = 0; txr->next_to_clean = 0; #ifdef IXL_FDIR /* Initialize flow director */ txr->atr_rate = ixl_atr_rate; txr->atr_count = 0; #endif /* Free any existing tx mbufs. */ buf = txr->buffers; for (int i = 0; i < que->num_desc; i++, buf++) { if (buf->m_head != NULL) { bus_dmamap_sync(buf->tag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(buf->tag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; } /* Clear the EOP index */ buf->eop_index = -1; } /* Set number of descriptors available */ txr->avail = que->num_desc; bus_dmamap_sync(txr->dma.tag, txr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); IXL_TX_UNLOCK(txr); } /********************************************************************* * * Free transmit ring related data structures. * **********************************************************************/ void ixl_free_que_tx(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; struct ixl_tx_buf *buf; INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me); for (int i = 0; i < que->num_desc; i++) { buf = &txr->buffers[i]; if (buf->m_head != NULL) { bus_dmamap_sync(buf->tag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(buf->tag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; if (buf->map != NULL) { bus_dmamap_destroy(buf->tag, buf->map); buf->map = NULL; } } else if (buf->map != NULL) { bus_dmamap_unload(buf->tag, buf->map); bus_dmamap_destroy(buf->tag, buf->map); buf->map = NULL; } } if (txr->br != NULL) buf_ring_free(txr->br, M_DEVBUF); if (txr->buffers != NULL) { free(txr->buffers, M_DEVBUF); txr->buffers = NULL; } if (txr->tx_tag != NULL) { bus_dma_tag_destroy(txr->tx_tag); txr->tx_tag = NULL; } if (txr->tso_tag != NULL) { bus_dma_tag_destroy(txr->tso_tag); txr->tso_tag = NULL; } INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me); return; } /********************************************************************* * * Setup descriptor for hw offloads * **********************************************************************/ static int ixl_tx_setup_offload(struct ixl_queue *que, struct mbuf *mp, u32 *cmd, u32 *off) { struct ether_vlan_header *eh; #ifdef INET struct ip *ip = NULL; #endif struct tcphdr *th = NULL; #ifdef INET6 struct ip6_hdr *ip6; #endif int elen, ip_hlen = 0, tcp_hlen; u16 etype; u8 ipproto = 0; bool tso = FALSE; /* Set up the TSO context descriptor if required */ if (mp->m_pkthdr.csum_flags & CSUM_TSO) { tso = ixl_tso_setup(que, mp); if (tso) ++que->tso; else return (ENXIO); } /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); elen = ETHER_HDR_LEN; } switch (etype) { #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + elen); ip_hlen = ip->ip_hl << 2; ipproto = ip->ip_p; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); /* The IP checksum must be recalculated with TSO */ if (tso) *cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; else *cmd |= I40E_TX_DESC_CMD_IIPT_IPV4; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + elen); ip_hlen = sizeof(struct ip6_hdr); ipproto = ip6->ip6_nxt; th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); *cmd |= I40E_TX_DESC_CMD_IIPT_IPV6; break; #endif default: break; } *off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; *off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; switch (ipproto) { case IPPROTO_TCP: tcp_hlen = th->th_off << 2; if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) { *cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; *off |= (tcp_hlen >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } #ifdef IXL_FDIR ixl_atr(que, th, etype); #endif break; case IPPROTO_UDP: if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) { *cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; *off |= (sizeof(struct udphdr) >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } break; case IPPROTO_SCTP: if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) { *cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; *off |= (sizeof(struct sctphdr) >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } /* Fall Thru */ default: break; } return (0); } /********************************************************************** * * Setup context for hardware segmentation offload (TSO) * **********************************************************************/ static bool ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp) { struct tx_ring *txr = &que->txr; struct i40e_tx_context_desc *TXD; struct ixl_tx_buf *buf; u32 cmd, mss, type, tsolen; u16 etype; int idx, elen, ip_hlen, tcp_hlen; struct ether_vlan_header *eh; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif #if defined(INET6) || defined(INET) struct tcphdr *th; #endif u64 type_cmd_tso_mss; /* * Determine where frame payload starts. * Jump over vlan headers if already present */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; etype = eh->evl_proto; } else { elen = ETHER_HDR_LEN; etype = eh->evl_encap_proto; } switch (ntohs(etype)) { #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + elen); if (ip6->ip6_nxt != IPPROTO_TCP) return (ENXIO); ip_hlen = sizeof(struct ip6_hdr); th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); tcp_hlen = th->th_off << 2; break; #endif #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + elen); if (ip->ip_p != IPPROTO_TCP) return (ENXIO); ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); tcp_hlen = th->th_off << 2; break; #endif default: printf("%s: CSUM_TSO but no supported IP version (0x%04x)", __func__, ntohs(etype)); return FALSE; } /* Ensure we have at least the IP+TCP header in the first mbuf. */ if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr)) return FALSE; idx = txr->next_avail; buf = &txr->buffers[idx]; TXD = (struct i40e_tx_context_desc *) &txr->base[idx]; tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen); type = I40E_TX_DESC_DTYPE_CONTEXT; cmd = I40E_TX_CTX_DESC_TSO; mss = mp->m_pkthdr.tso_segsz; type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) | ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) | ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT); TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss); TXD->tunneling_params = htole32(0); buf->m_head = NULL; buf->eop_index = -1; if (++idx == que->num_desc) idx = 0; txr->avail--; txr->next_avail = idx; return TRUE; } /* ** ixl_get_tx_head - Retrieve the value from the ** location the HW records its HEAD index */ static inline u32 ixl_get_tx_head(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; void *head = &txr->base[que->num_desc]; return LE32_TO_CPU(*(volatile __le32 *)head); } /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. * **********************************************************************/ bool ixl_txeof(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; u32 first, last, head, done, processed; struct ixl_tx_buf *buf; struct i40e_tx_desc *tx_desc, *eop_desc; mtx_assert(&txr->mtx, MA_OWNED); /* These are not the descriptors you seek, move along :) */ if (txr->avail == que->num_desc) { que->busy = 0; return FALSE; } processed = 0; first = txr->next_to_clean; buf = &txr->buffers[first]; tx_desc = (struct i40e_tx_desc *)&txr->base[first]; last = buf->eop_index; if (last == -1) return FALSE; eop_desc = (struct i40e_tx_desc *)&txr->base[last]; /* Get the Head WB value */ head = ixl_get_tx_head(que); /* ** Get the index of the first descriptor ** BEYOND the EOP and call that 'done'. ** I do this so the comparison in the ** inner while loop below can be simple */ if (++last == que->num_desc) last = 0; done = last; bus_dmamap_sync(txr->dma.tag, txr->dma.map, BUS_DMASYNC_POSTREAD); /* ** The HEAD index of the ring is written in a ** defined location, this rather than a done bit ** is what is used to keep track of what must be ** 'cleaned'. */ while (first != head) { /* We clean the range of the packet */ while (first != done) { ++txr->avail; ++processed; if (buf->m_head) { txr->bytes += /* for ITR adjustment */ buf->m_head->m_pkthdr.len; txr->tx_bytes += /* for TX stats */ buf->m_head->m_pkthdr.len; bus_dmamap_sync(buf->tag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(buf->tag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; buf->map = NULL; } buf->eop_index = -1; if (++first == que->num_desc) first = 0; buf = &txr->buffers[first]; tx_desc = &txr->base[first]; } ++txr->packets; /* See if there is more work now */ last = buf->eop_index; if (last != -1) { eop_desc = &txr->base[last]; /* Get next done point */ if (++last == que->num_desc) last = 0; done = last; } else break; } bus_dmamap_sync(txr->dma.tag, txr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); txr->next_to_clean = first; /* ** Hang detection, we know there's ** work outstanding or the first return ** would have been taken, so indicate an ** unsuccessful pass, in local_timer if ** the value is too great the queue will ** be considered hung. If anything has been ** cleaned then reset the state. */ if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG)) ++que->busy; if (processed) que->busy = 1; /* Note this turns off HUNG */ /* * If there are no pending descriptors, clear the timeout. */ if (txr->avail == que->num_desc) { que->busy = 0; return FALSE; } return TRUE; } /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings * - now keeps its own state so discards due to resource * exhaustion are unnecessary, if an mbuf cannot be obtained * it just returns, keeping its placeholder, thus it can simply * be recalled to try again. * **********************************************************************/ static void ixl_refresh_mbufs(struct ixl_queue *que, int limit) { struct ixl_vsi *vsi = que->vsi; struct rx_ring *rxr = &que->rxr; bus_dma_segment_t hseg[1]; bus_dma_segment_t pseg[1]; struct ixl_rx_buf *buf; struct mbuf *mh, *mp; int i, j, nsegs, error; bool refreshed = FALSE; i = j = rxr->next_refresh; /* Control the loop with one beyond */ if (++j == que->num_desc) j = 0; while (j != limit) { buf = &rxr->buffers[i]; if (rxr->hdr_split == FALSE) goto no_split; if (buf->m_head == NULL) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) goto update; } else mh = buf->m_head; mh->m_pkthdr.len = mh->m_len = MHLEN; mh->m_len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: hdr dmamap load" " failure - %d\n", error); m_free(mh); buf->m_head = NULL; goto update; } buf->m_head = mh; bus_dmamap_sync(rxr->htag, buf->hmap, BUS_DMASYNC_PREREAD); rxr->base[i].read.hdr_addr = htole64(hseg[0].ds_addr); no_split: if (buf->m_pack == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxr->mbuf_sz); if (mp == NULL) goto update; } else mp = buf->m_pack; mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: payload dmamap load" " failure - %d\n", error); m_free(mp); buf->m_pack = NULL; goto update; } buf->m_pack = mp; bus_dmamap_sync(rxr->ptag, buf->pmap, BUS_DMASYNC_PREREAD); rxr->base[i].read.pkt_addr = htole64(pseg[0].ds_addr); /* Used only when doing header split */ rxr->base[i].read.hdr_addr = 0; refreshed = TRUE; /* Next is precalculated */ i = j; rxr->next_refresh = i; if (++j == que->num_desc) j = 0; } update: if (refreshed) /* Update hardware tail index */ wr32(vsi->hw, rxr->tail, rxr->next_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per descriptor, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've defined. * **********************************************************************/ int ixl_allocate_rx_data(struct ixl_queue *que) { struct rx_ring *rxr = &que->rxr; struct ixl_vsi *vsi = que->vsi; device_t dev = vsi->dev; struct ixl_rx_buf *buf; int i, bsize, error; bsize = sizeof(struct ixl_rx_buf) * que->num_desc; if (!(rxr->buffers = (struct ixl_rx_buf *) malloc(bsize, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); error = ENOMEM; return (error); } if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MSIZE, /* maxsize */ 1, /* nsegments */ MSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->htag))) { device_printf(dev, "Unable to create RX DMA htag\n"); return (error); } if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM16BYTES, /* maxsize */ 1, /* nsegments */ MJUM16BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->ptag))) { device_printf(dev, "Unable to create RX DMA ptag\n"); return (error); } for (i = 0; i < que->num_desc; i++) { buf = &rxr->buffers[i]; error = bus_dmamap_create(rxr->htag, BUS_DMA_NOWAIT, &buf->hmap); if (error) { device_printf(dev, "Unable to create RX head map\n"); break; } error = bus_dmamap_create(rxr->ptag, BUS_DMA_NOWAIT, &buf->pmap); if (error) { device_printf(dev, "Unable to create RX pkt map\n"); break; } } return (error); } /********************************************************************* * * (Re)Initialize the queue receive ring and its buffers. * **********************************************************************/ int ixl_init_rx_ring(struct ixl_queue *que) { struct rx_ring *rxr = &que->rxr; struct ixl_vsi *vsi = que->vsi; #if defined(INET6) || defined(INET) struct ifnet *ifp = vsi->ifp; struct lro_ctrl *lro = &rxr->lro; #endif struct ixl_rx_buf *buf; bus_dma_segment_t pseg[1], hseg[1]; int rsize, nsegs, error = 0; IXL_RX_LOCK(rxr); /* Clear the ring contents */ rsize = roundup2(que->num_desc * sizeof(union i40e_rx_desc), DBA_ALIGN); bzero((void *)rxr->base, rsize); /* Cleanup any existing buffers */ for (int i = 0; i < que->num_desc; i++) { buf = &rxr->buffers[i]; if (buf->m_head != NULL) { bus_dmamap_sync(rxr->htag, buf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, buf->hmap); buf->m_head->m_flags |= M_PKTHDR; m_freem(buf->m_head); } if (buf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, buf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, buf->pmap); buf->m_pack->m_flags |= M_PKTHDR; m_freem(buf->m_pack); } buf->m_head = NULL; buf->m_pack = NULL; } /* header split is off */ rxr->hdr_split = FALSE; /* Now replenish the mbufs */ for (int j = 0; j != que->num_desc; ++j) { struct mbuf *mh, *mp; buf = &rxr->buffers[j]; /* ** Don't allocate mbufs if not ** doing header split, its wasteful */ if (rxr->hdr_split == FALSE) goto skip_head; /* First the header */ buf->m_head = m_gethdr(M_NOWAIT, MT_DATA); if (buf->m_head == NULL) { error = ENOBUFS; goto fail; } m_adj(buf->m_head, ETHER_ALIGN); mh = buf->m_head; mh->m_len = mh->m_pkthdr.len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, buf->hmap, buf->m_head, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) /* Nothing elegant to do here */ goto fail; bus_dmamap_sync(rxr->htag, buf->hmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr); skip_head: /* Now the payload cluster */ buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxr->mbuf_sz); if (buf->m_pack == NULL) { error = ENOBUFS; goto fail; } mp = buf->m_pack; mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) goto fail; bus_dmamap_sync(rxr->ptag, buf->pmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr); rxr->base[j].read.hdr_addr = 0; } /* Setup our descriptor indices */ rxr->next_check = 0; rxr->next_refresh = 0; rxr->lro_enabled = FALSE; rxr->split = 0; rxr->bytes = 0; rxr->discard = FALSE; wr32(vsi->hw, rxr->tail, que->num_desc - 1); ixl_flush(vsi->hw); #if defined(INET6) || defined(INET) /* ** Now set up the LRO interface: */ if (ifp->if_capenable & IFCAP_LRO) { int err = tcp_lro_init(lro); if (err) { if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me); goto fail; } INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me); rxr->lro_enabled = TRUE; lro->ifp = vsi->ifp; } #endif bus_dmamap_sync(rxr->dma.tag, rxr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); fail: IXL_RX_UNLOCK(rxr); return (error); } /********************************************************************* * * Free station receive ring data structures * **********************************************************************/ void ixl_free_que_rx(struct ixl_queue *que) { struct rx_ring *rxr = &que->rxr; struct ixl_rx_buf *buf; INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me); /* Cleanup any existing buffers */ if (rxr->buffers != NULL) { for (int i = 0; i < que->num_desc; i++) { buf = &rxr->buffers[i]; if (buf->m_head != NULL) { bus_dmamap_sync(rxr->htag, buf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, buf->hmap); buf->m_head->m_flags |= M_PKTHDR; m_freem(buf->m_head); } if (buf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, buf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, buf->pmap); buf->m_pack->m_flags |= M_PKTHDR; m_freem(buf->m_pack); } buf->m_head = NULL; buf->m_pack = NULL; if (buf->hmap != NULL) { bus_dmamap_destroy(rxr->htag, buf->hmap); buf->hmap = NULL; } if (buf->pmap != NULL) { bus_dmamap_destroy(rxr->ptag, buf->pmap); buf->pmap = NULL; } } if (rxr->buffers != NULL) { free(rxr->buffers, M_DEVBUF); rxr->buffers = NULL; } } if (rxr->htag != NULL) { bus_dma_tag_destroy(rxr->htag); rxr->htag = NULL; } if (rxr->ptag != NULL) { bus_dma_tag_destroy(rxr->ptag); rxr->ptag = NULL; } INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me); return; } static __inline void ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype) { #if defined(INET6) || defined(INET) /* * ATM LRO is only for IPv4/TCP packets and TCP checksum of the packet * should be computed by hardware. Also it should not have VLAN tag in * ethernet header. */ if (rxr->lro_enabled && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) == (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) { /* * Send to the stack if: ** - LRO not enabled, or ** - no LRO resources, or ** - lro enqueue fails */ if (rxr->lro.lro_cnt != 0) if (tcp_lro_rx(&rxr->lro, m, 0) == 0) return; } #endif IXL_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, m); IXL_RX_LOCK(rxr); } static __inline void ixl_rx_discard(struct rx_ring *rxr, int i) { struct ixl_rx_buf *rbuf; rbuf = &rxr->buffers[i]; if (rbuf->fmp != NULL) {/* Partial chain ? */ rbuf->fmp->m_flags |= M_PKTHDR; m_freem(rbuf->fmp); rbuf->fmp = NULL; } /* ** With advanced descriptors the writeback ** clobbers the buffer addrs, so its easier ** to just free the existing mbufs and take ** the normal refresh path to get new buffers ** and mapping. */ if (rbuf->m_head) { m_free(rbuf->m_head); rbuf->m_head = NULL; } if (rbuf->m_pack) { m_free(rbuf->m_pack); rbuf->m_pack = NULL; } return; } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * Return TRUE for more work, FALSE for all clean. *********************************************************************/ bool ixl_rxeof(struct ixl_queue *que, int count) { struct ixl_vsi *vsi = que->vsi; struct rx_ring *rxr = &que->rxr; struct ifnet *ifp = vsi->ifp; #if defined(INET6) || defined(INET) struct lro_ctrl *lro = &rxr->lro; struct lro_entry *queued; #endif int i, nextp, processed = 0; union i40e_rx_desc *cur; struct ixl_rx_buf *rbuf, *nbuf; IXL_RX_LOCK(rxr); for (i = rxr->next_check; count != 0;) { struct mbuf *sendmp, *mh, *mp; u32 rsc, status, error; u16 hlen, plen, vtag; u64 qword; u8 ptype; bool eop; /* Sync the ring. */ bus_dmamap_sync(rxr->dma.tag, rxr->dma.map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); cur = &rxr->base[i]; qword = le64toh(cur->wb.qword1.status_error_len); status = (qword & I40E_RXD_QW1_STATUS_MASK) >> I40E_RXD_QW1_STATUS_SHIFT; error = (qword & I40E_RXD_QW1_ERROR_MASK) >> I40E_RXD_QW1_ERROR_SHIFT; plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT; hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK) >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT; ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT; if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) { ++rxr->not_done; break; } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; count--; sendmp = NULL; nbuf = NULL; rsc = 0; cur->wb.qword1.status_error_len = 0; rbuf = &rxr->buffers[i]; mh = rbuf->m_head; mp = rbuf->m_pack; eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT)); if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1); else vtag = 0; /* ** Make sure bad packets are discarded, ** note that only EOP descriptor has valid ** error results. */ if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) { rxr->discarded++; ixl_rx_discard(rxr, i); goto next_desc; } /* Prefetch the next buffer */ if (!eop) { nextp = i + 1; if (nextp == que->num_desc) nextp = 0; nbuf = &rxr->buffers[nextp]; prefetch(nbuf); } /* ** The header mbuf is ONLY used when header ** split is enabled, otherwise we get normal ** behavior, ie, both header and payload ** are DMA'd into the payload buffer. ** ** Rather than using the fmp/lmp global pointers ** we now keep the head of a packet chain in the ** buffer struct and pass this along from one ** descriptor to the next, until we get EOP. */ if (rxr->hdr_split && (rbuf->fmp == NULL)) { if (hlen > IXL_RX_HDR) hlen = IXL_RX_HDR; mh->m_len = hlen; mh->m_flags |= M_PKTHDR; mh->m_next = NULL; mh->m_pkthdr.len = mh->m_len; /* Null buf pointer so it is refreshed */ rbuf->m_head = NULL; /* ** Check the payload length, this ** could be zero if its a small ** packet. */ if (plen > 0) { mp->m_len = plen; mp->m_next = NULL; mp->m_flags &= ~M_PKTHDR; mh->m_next = mp; mh->m_pkthdr.len += mp->m_len; /* Null buf pointer so it is refreshed */ rbuf->m_pack = NULL; rxr->split++; } /* ** Now create the forward ** chain so when complete ** we wont have to. */ if (eop == 0) { /* stash the chain head */ nbuf->fmp = mh; /* Make forward chain */ if (plen) mp->m_next = nbuf->m_pack; else mh->m_next = nbuf->m_pack; } else { /* Singlet, prepare to send */ sendmp = mh; if (vtag) { sendmp->m_pkthdr.ether_vtag = vtag; sendmp->m_flags |= M_VLANTAG; } } } else { /* ** Either no header split, or a ** secondary piece of a fragmented ** split packet. */ mp->m_len = plen; /* ** See if there is a stored head ** that determines what we are */ sendmp = rbuf->fmp; rbuf->m_pack = rbuf->fmp = NULL; if (sendmp != NULL) /* secondary frag */ sendmp->m_pkthdr.len += mp->m_len; else { /* first desc of a non-ps chain */ sendmp = mp; sendmp->m_flags |= M_PKTHDR; sendmp->m_pkthdr.len = mp->m_len; if (vtag) { sendmp->m_pkthdr.ether_vtag = vtag; sendmp->m_flags |= M_VLANTAG; } } /* Pass the head pointer on */ if (eop == 0) { nbuf->fmp = sendmp; sendmp = NULL; mp->m_next = nbuf->m_pack; } } ++processed; /* Sending this frame? */ if (eop) { sendmp->m_pkthdr.rcvif = ifp; /* gather stats */ rxr->rx_packets++; rxr->rx_bytes += sendmp->m_pkthdr.len; /* capture data for dynamic ITR adjustment */ rxr->packets++; rxr->bytes += sendmp->m_pkthdr.len; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) ixl_rx_checksum(sendmp, status, error, ptype); sendmp->m_pkthdr.flowid = que->msix; - sendmp->m_flags |= M_FLOWID; + M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE); } next_desc: bus_dmamap_sync(rxr->dma.tag, rxr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == que->num_desc) i = 0; /* Now send to the stack or do LRO */ if (sendmp != NULL) { rxr->next_check = i; ixl_rx_input(rxr, ifp, sendmp, ptype); i = rxr->next_check; } /* Every 8 descriptors we go to refresh mbufs */ if (processed == 8) { ixl_refresh_mbufs(que, i); processed = 0; } } /* Refresh any remaining buf structs */ if (ixl_rx_unrefreshed(que)) ixl_refresh_mbufs(que, i); rxr->next_check = i; #if defined(INET6) || defined(INET) /* * Flush any outstanding LRO work */ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } #endif IXL_RX_UNLOCK(rxr); return (FALSE); } /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of checksum so that stack * doesn't spend time verifying the checksum. * *********************************************************************/ static void ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype) { struct i40e_rx_ptype_decoded decoded; decoded = decode_rx_desc_ptype(ptype); /* Errors? */ if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) | (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) { mp->m_pkthdr.csum_flags = 0; return; } /* IPv6 with extension headers likely have bad csum */ if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) if (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) { mp->m_pkthdr.csum_flags = 0; return; } /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) { mp->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); mp->m_pkthdr.csum_data |= htons(0xffff); } return; } #if __FreeBSD_version >= 1100000 uint64_t ixl_get_counter(if_t ifp, ift_counter cnt) { struct ixl_vsi *vsi; vsi = if_getsoftc(ifp); switch (cnt) { case IFCOUNTER_IPACKETS: return (vsi->ipackets); case IFCOUNTER_IERRORS: return (vsi->ierrors); case IFCOUNTER_OPACKETS: return (vsi->opackets); case IFCOUNTER_OERRORS: return (vsi->oerrors); case IFCOUNTER_COLLISIONS: /* Collisions are by standard impossible in 40G/10G Ethernet */ return (0); case IFCOUNTER_IBYTES: return (vsi->ibytes); case IFCOUNTER_OBYTES: return (vsi->obytes); case IFCOUNTER_IMCASTS: return (vsi->imcasts); case IFCOUNTER_OMCASTS: return (vsi->omcasts); case IFCOUNTER_IQDROPS: return (vsi->iqdrops); case IFCOUNTER_OQDROPS: return (vsi->oqdrops); case IFCOUNTER_NOPROTO: return (vsi->noproto); default: return (if_get_counter_default(ifp, cnt)); } } #endif Index: head/sys/dev/mxge/if_mxge.c =================================================================== --- head/sys/dev/mxge/if_mxge.c (revision 275357) +++ head/sys/dev/mxge/if_mxge.c (revision 275358) @@ -1,5032 +1,5032 @@ /****************************************************************************** Copyright (c) 2006-2013, Myricom Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Myricom Inc, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for pci_cfg_restore */ #include /* for pmap_mapdev() */ #include #if defined(__i386) || defined(__amd64) #include #endif #include #include /*#define MXGE_FAKE_IFP*/ #include #ifdef IFNET_BUF_RING #include #endif #include "opt_inet.h" #include "opt_inet6.h" /* tunable params */ static int mxge_nvidia_ecrc_enable = 1; static int mxge_force_firmware = 0; static int mxge_intr_coal_delay = 30; static int mxge_deassert_wait = 1; static int mxge_flow_control = 1; static int mxge_verbose = 0; static int mxge_ticks; static int mxge_max_slices = 1; static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT; static int mxge_always_promisc = 0; static int mxge_initial_mtu = ETHERMTU_JUMBO; static int mxge_throttle = 0; static char *mxge_fw_unaligned = "mxge_ethp_z8e"; static char *mxge_fw_aligned = "mxge_eth_z8e"; static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e"; static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e"; static int mxge_probe(device_t dev); static int mxge_attach(device_t dev); static int mxge_detach(device_t dev); static int mxge_shutdown(device_t dev); static void mxge_intr(void *arg); static device_method_t mxge_methods[] = { /* Device interface */ DEVMETHOD(device_probe, mxge_probe), DEVMETHOD(device_attach, mxge_attach), DEVMETHOD(device_detach, mxge_detach), DEVMETHOD(device_shutdown, mxge_shutdown), DEVMETHOD_END }; static driver_t mxge_driver = { "mxge", mxge_methods, sizeof(mxge_softc_t), }; static devclass_t mxge_devclass; /* Declare ourselves to be a child of the PCI bus.*/ DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0); MODULE_DEPEND(mxge, firmware, 1, 1, 1); MODULE_DEPEND(mxge, zlib, 1, 1, 1); static int mxge_load_firmware(mxge_softc_t *sc, int adopt); static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data); static int mxge_close(mxge_softc_t *sc, int down); static int mxge_open(mxge_softc_t *sc); static void mxge_tick(void *arg); static int mxge_probe(device_t dev) { int rev; if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) && ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) || (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) { rev = pci_get_revid(dev); switch (rev) { case MXGE_PCI_REV_Z8E: device_set_desc(dev, "Myri10G-PCIE-8A"); break; case MXGE_PCI_REV_Z8ES: device_set_desc(dev, "Myri10G-PCIE-8B"); break; default: device_set_desc(dev, "Myri10G-PCIE-8??"); device_printf(dev, "Unrecognized rev %d NIC\n", rev); break; } return 0; } return ENXIO; } static void mxge_enable_wc(mxge_softc_t *sc) { #if defined(__i386) || defined(__amd64) vm_offset_t len; int err; sc->wc = 1; len = rman_get_size(sc->mem_res); err = pmap_change_attr((vm_offset_t) sc->sram, len, PAT_WRITE_COMBINING); if (err != 0) { device_printf(sc->dev, "pmap_change_attr failed, %d\n", err); sc->wc = 0; } #endif } /* callback to get our DMA address */ static void mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { if (error == 0) { *(bus_addr_t *) arg = segs->ds_addr; } } static int mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes, bus_size_t alignment) { int err; device_t dev = sc->dev; bus_size_t boundary, maxsegsize; if (bytes > 4096 && alignment == 4096) { boundary = 0; maxsegsize = bytes; } else { boundary = 4096; maxsegsize = 4096; } /* allocate DMAable memory tags */ err = bus_dma_tag_create(sc->parent_dmat, /* parent */ alignment, /* alignment */ boundary, /* boundary */ BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ bytes, /* maxsize */ 1, /* num segs */ maxsegsize, /* maxsegsize */ BUS_DMA_COHERENT, /* flags */ NULL, NULL, /* lock */ &dma->dmat); /* tag */ if (err != 0) { device_printf(dev, "couldn't alloc tag (err = %d)\n", err); return err; } /* allocate DMAable memory & map */ err = bus_dmamem_alloc(dma->dmat, &dma->addr, (BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO), &dma->map); if (err != 0) { device_printf(dev, "couldn't alloc mem (err = %d)\n", err); goto abort_with_dmat; } /* load the memory */ err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes, mxge_dmamap_callback, (void *)&dma->bus_addr, 0); if (err != 0) { device_printf(dev, "couldn't load map (err = %d)\n", err); goto abort_with_mem; } return 0; abort_with_mem: bus_dmamem_free(dma->dmat, dma->addr, dma->map); abort_with_dmat: (void)bus_dma_tag_destroy(dma->dmat); return err; } static void mxge_dma_free(mxge_dma_t *dma) { bus_dmamap_unload(dma->dmat, dma->map); bus_dmamem_free(dma->dmat, dma->addr, dma->map); (void)bus_dma_tag_destroy(dma->dmat); } /* * The eeprom strings on the lanaiX have the format * SN=x\0 * MAC=x:x:x:x:x:x\0 * PC=text\0 */ static int mxge_parse_strings(mxge_softc_t *sc) { char *ptr; int i, found_mac, found_sn2; char *endptr; ptr = sc->eeprom_strings; found_mac = 0; found_sn2 = 0; while (*ptr != '\0') { if (strncmp(ptr, "MAC=", 4) == 0) { ptr += 4; for (i = 0;;) { sc->mac_addr[i] = strtoul(ptr, &endptr, 16); if (endptr - ptr != 2) goto abort; ptr = endptr; if (++i == 6) break; if (*ptr++ != ':') goto abort; } found_mac = 1; } else if (strncmp(ptr, "PC=", 3) == 0) { ptr += 3; strlcpy(sc->product_code_string, ptr, sizeof(sc->product_code_string)); } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) { ptr += 3; strlcpy(sc->serial_number_string, ptr, sizeof(sc->serial_number_string)); } else if (strncmp(ptr, "SN2=", 4) == 0) { /* SN2 takes precedence over SN */ ptr += 4; found_sn2 = 1; strlcpy(sc->serial_number_string, ptr, sizeof(sc->serial_number_string)); } while (*ptr++ != '\0') {} } if (found_mac) return 0; abort: device_printf(sc->dev, "failed to parse eeprom_strings\n"); return ENXIO; } #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__ static void mxge_enable_nvidia_ecrc(mxge_softc_t *sc) { uint32_t val; unsigned long base, off; char *va, *cfgptr; device_t pdev, mcp55; uint16_t vendor_id, device_id, word; uintptr_t bus, slot, func, ivend, idev; uint32_t *ptr32; if (!mxge_nvidia_ecrc_enable) return; pdev = device_get_parent(device_get_parent(sc->dev)); if (pdev == NULL) { device_printf(sc->dev, "could not find parent?\n"); return; } vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2); device_id = pci_read_config(pdev, PCIR_DEVICE, 2); if (vendor_id != 0x10de) return; base = 0; if (device_id == 0x005d) { /* ck804, base address is magic */ base = 0xe0000000UL; } else if (device_id >= 0x0374 && device_id <= 0x378) { /* mcp55, base address stored in chipset */ mcp55 = pci_find_bsf(0, 0, 0); if (mcp55 && 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) && 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) { word = pci_read_config(mcp55, 0x90, 2); base = ((unsigned long)word & 0x7ffeU) << 25; } } if (!base) return; /* XXXX Test below is commented because it is believed that doing config read/write beyond 0xff will access the config space for the next larger function. Uncomment this and remove the hacky pmap_mapdev() way of accessing config space when FreeBSD grows support for extended pcie config space access */ #if 0 /* See if we can, by some miracle, access the extended config space */ val = pci_read_config(pdev, 0x178, 4); if (val != 0xffffffff) { val |= 0x40; pci_write_config(pdev, 0x178, val, 4); return; } #endif /* Rather than using normal pci config space writes, we must * map the Nvidia config space ourselves. This is because on * opteron/nvidia class machine the 0xe000000 mapping is * handled by the nvidia chipset, that means the internal PCI * device (the on-chip northbridge), or the amd-8131 bridge * and things behind them are not visible by this method. */ BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_BUS, &bus); BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_SLOT, &slot); BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_FUNCTION, &func); BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_VENDOR, &ivend); BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_DEVICE, &idev); off = base + 0x00100000UL * (unsigned long)bus + 0x00001000UL * (unsigned long)(func + 8 * slot); /* map it into the kernel */ va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE); if (va == NULL) { device_printf(sc->dev, "pmap_kenter_temporary didn't\n"); return; } /* get a pointer to the config space mapped into the kernel */ cfgptr = va + (off & PAGE_MASK); /* make sure that we can really access it */ vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR); device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE); if (! (vendor_id == ivend && device_id == idev)) { device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n", vendor_id, device_id); pmap_unmapdev((vm_offset_t)va, PAGE_SIZE); return; } ptr32 = (uint32_t*)(cfgptr + 0x178); val = *ptr32; if (val == 0xffffffff) { device_printf(sc->dev, "extended mapping failed\n"); pmap_unmapdev((vm_offset_t)va, PAGE_SIZE); return; } *ptr32 = val | 0x40; pmap_unmapdev((vm_offset_t)va, PAGE_SIZE); if (mxge_verbose) device_printf(sc->dev, "Enabled ECRC on upstream Nvidia bridge " "at %d:%d:%d\n", (int)bus, (int)slot, (int)func); return; } #else static void mxge_enable_nvidia_ecrc(mxge_softc_t *sc) { device_printf(sc->dev, "Nforce 4 chipset on non-x86/amd64!?!?!\n"); return; } #endif static int mxge_dma_test(mxge_softc_t *sc, int test_type) { mxge_cmd_t cmd; bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr; int status; uint32_t len; char *test = " "; /* Run a small DMA test. * The magic multipliers to the length tell the firmware * to do DMA read, write, or read+write tests. The * results are returned in cmd.data0. The upper 16 * bits of the return is the number of transfers completed. * The lower 16 bits is the time in 0.5us ticks that the * transfers took to complete. */ len = sc->tx_boundary; cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus); cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus); cmd.data2 = len * 0x10000; status = mxge_send_cmd(sc, test_type, &cmd); if (status != 0) { test = "read"; goto abort; } sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff); cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus); cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus); cmd.data2 = len * 0x1; status = mxge_send_cmd(sc, test_type, &cmd); if (status != 0) { test = "write"; goto abort; } sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff); cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus); cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus); cmd.data2 = len * 0x10001; status = mxge_send_cmd(sc, test_type, &cmd); if (status != 0) { test = "read/write"; goto abort; } sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) / (cmd.data0 & 0xffff); abort: if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) device_printf(sc->dev, "DMA %s benchmark failed: %d\n", test, status); return status; } /* * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput * when the PCI-E Completion packets are aligned on an 8-byte * boundary. Some PCI-E chip sets always align Completion packets; on * the ones that do not, the alignment can be enforced by enabling * ECRC generation (if supported). * * When PCI-E Completion packets are not aligned, it is actually more * efficient to limit Read-DMA transactions to 2KB, rather than 4KB. * * If the driver can neither enable ECRC nor verify that it has * already been enabled, then it must use a firmware image which works * around unaligned completion packets (ethp_z8e.dat), and it should * also ensure that it never gives the device a Read-DMA which is * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is * enabled, then the driver should use the aligned (eth_z8e.dat) * firmware image, and set tx_boundary to 4KB. */ static int mxge_firmware_probe(mxge_softc_t *sc) { device_t dev = sc->dev; int reg, status; uint16_t pectl; sc->tx_boundary = 4096; /* * Verify the max read request size was set to 4KB * before trying the test with 4KB. */ if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) { pectl = pci_read_config(dev, reg + 0x8, 2); if ((pectl & (5 << 12)) != (5 << 12)) { device_printf(dev, "Max Read Req. size != 4k (0x%x\n", pectl); sc->tx_boundary = 2048; } } /* * load the optimized firmware (which assumes aligned PCIe * completions) in order to see if it works on this host. */ sc->fw_name = mxge_fw_aligned; status = mxge_load_firmware(sc, 1); if (status != 0) { return status; } /* * Enable ECRC if possible */ mxge_enable_nvidia_ecrc(sc); /* * Run a DMA test which watches for unaligned completions and * aborts on the first one seen. Not required on Z8ES or newer. */ if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES) return 0; status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST); if (status == 0) return 0; /* keep the aligned firmware */ if (status != E2BIG) device_printf(dev, "DMA test failed: %d\n", status); if (status == ENOSYS) device_printf(dev, "Falling back to ethp! " "Please install up to date fw\n"); return status; } static int mxge_select_firmware(mxge_softc_t *sc) { int aligned = 0; int force_firmware = mxge_force_firmware; if (sc->throttle) force_firmware = sc->throttle; if (force_firmware != 0) { if (force_firmware == 1) aligned = 1; else aligned = 0; if (mxge_verbose) device_printf(sc->dev, "Assuming %s completions (forced)\n", aligned ? "aligned" : "unaligned"); goto abort; } /* if the PCIe link width is 4 or less, we can use the aligned firmware and skip any checks */ if (sc->link_width != 0 && sc->link_width <= 4) { device_printf(sc->dev, "PCIe x%d Link, expect reduced performance\n", sc->link_width); aligned = 1; goto abort; } if (0 == mxge_firmware_probe(sc)) return 0; abort: if (aligned) { sc->fw_name = mxge_fw_aligned; sc->tx_boundary = 4096; } else { sc->fw_name = mxge_fw_unaligned; sc->tx_boundary = 2048; } return (mxge_load_firmware(sc, 0)); } static int mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr) { if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) { device_printf(sc->dev, "Bad firmware type: 0x%x\n", be32toh(hdr->mcp_type)); return EIO; } /* save firmware version for sysctl */ strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version)); if (mxge_verbose) device_printf(sc->dev, "firmware id: %s\n", hdr->version); sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major, &sc->fw_ver_minor, &sc->fw_ver_tiny); if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) { device_printf(sc->dev, "Found firmware version %s\n", sc->fw_version); device_printf(sc->dev, "Driver needs %d.%d\n", MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR); return EINVAL; } return 0; } static void * z_alloc(void *nil, u_int items, u_int size) { void *ptr; ptr = malloc(items * size, M_TEMP, M_NOWAIT); return ptr; } static void z_free(void *nil, void *ptr) { free(ptr, M_TEMP); } static int mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit) { z_stream zs; char *inflate_buffer; const struct firmware *fw; const mcp_gen_header_t *hdr; unsigned hdr_offset; int status; unsigned int i; char dummy; size_t fw_len; fw = firmware_get(sc->fw_name); if (fw == NULL) { device_printf(sc->dev, "Could not find firmware image %s\n", sc->fw_name); return ENOENT; } /* setup zlib and decompress f/w */ bzero(&zs, sizeof (zs)); zs.zalloc = z_alloc; zs.zfree = z_free; status = inflateInit(&zs); if (status != Z_OK) { status = EIO; goto abort_with_fw; } /* the uncompressed size is stored as the firmware version, which would otherwise go unused */ fw_len = (size_t) fw->version; inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT); if (inflate_buffer == NULL) goto abort_with_zs; zs.avail_in = fw->datasize; zs.next_in = __DECONST(char *, fw->data); zs.avail_out = fw_len; zs.next_out = inflate_buffer; status = inflate(&zs, Z_FINISH); if (status != Z_STREAM_END) { device_printf(sc->dev, "zlib %d\n", status); status = EIO; goto abort_with_buffer; } /* check id */ hdr_offset = htobe32(*(const uint32_t *) (inflate_buffer + MCP_HEADER_PTR_OFFSET)); if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) { device_printf(sc->dev, "Bad firmware file"); status = EIO; goto abort_with_buffer; } hdr = (const void*)(inflate_buffer + hdr_offset); status = mxge_validate_firmware(sc, hdr); if (status != 0) goto abort_with_buffer; /* Copy the inflated firmware to NIC SRAM. */ for (i = 0; i < fw_len; i += 256) { mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i, min(256U, (unsigned)(fw_len - i))); wmb(); dummy = *sc->sram; wmb(); } *limit = fw_len; status = 0; abort_with_buffer: free(inflate_buffer, M_TEMP); abort_with_zs: inflateEnd(&zs); abort_with_fw: firmware_put(fw, FIRMWARE_UNLOAD); return status; } /* * Enable or disable periodic RDMAs from the host to make certain * chipsets resend dropped PCIe messages */ static void mxge_dummy_rdma(mxge_softc_t *sc, int enable) { char buf_bytes[72]; volatile uint32_t *confirm; volatile char *submit; uint32_t *buf, dma_low, dma_high; int i; buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL); /* clear confirmation addr */ confirm = (volatile uint32_t *)sc->cmd; *confirm = 0; wmb(); /* send an rdma command to the PCIe engine, and wait for the response in the confirmation address. The firmware should write a -1 there to indicate it is alive and well */ dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr); dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr); buf[0] = htobe32(dma_high); /* confirm addr MSW */ buf[1] = htobe32(dma_low); /* confirm addr LSW */ buf[2] = htobe32(0xffffffff); /* confirm data */ dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr); dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr); buf[3] = htobe32(dma_high); /* dummy addr MSW */ buf[4] = htobe32(dma_low); /* dummy addr LSW */ buf[5] = htobe32(enable); /* enable? */ submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA); mxge_pio_copy(submit, buf, 64); wmb(); DELAY(1000); wmb(); i = 0; while (*confirm != 0xffffffff && i < 20) { DELAY(1000); i++; } if (*confirm != 0xffffffff) { device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)", (enable ? "enable" : "disable"), confirm, *confirm); } return; } static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data) { mcp_cmd_t *buf; char buf_bytes[sizeof(*buf) + 8]; volatile mcp_cmd_response_t *response = sc->cmd; volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD; uint32_t dma_low, dma_high; int err, sleep_total = 0; /* ensure buf is aligned to 8 bytes */ buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL); buf->data0 = htobe32(data->data0); buf->data1 = htobe32(data->data1); buf->data2 = htobe32(data->data2); buf->cmd = htobe32(cmd); dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr); dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr); buf->response_addr.low = htobe32(dma_low); buf->response_addr.high = htobe32(dma_high); mtx_lock(&sc->cmd_mtx); response->result = 0xffffffff; wmb(); mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf)); /* wait up to 20ms */ err = EAGAIN; for (sleep_total = 0; sleep_total < 20; sleep_total++) { bus_dmamap_sync(sc->cmd_dma.dmat, sc->cmd_dma.map, BUS_DMASYNC_POSTREAD); wmb(); switch (be32toh(response->result)) { case 0: data->data0 = be32toh(response->data); err = 0; break; case 0xffffffff: DELAY(1000); break; case MXGEFW_CMD_UNKNOWN: err = ENOSYS; break; case MXGEFW_CMD_ERROR_UNALIGNED: err = E2BIG; break; case MXGEFW_CMD_ERROR_BUSY: err = EBUSY; break; case MXGEFW_CMD_ERROR_I2C_ABSENT: err = ENXIO; break; default: device_printf(sc->dev, "mxge: command %d " "failed, result = %d\n", cmd, be32toh(response->result)); err = ENXIO; break; } if (err != EAGAIN) break; } if (err == EAGAIN) device_printf(sc->dev, "mxge: command %d timed out" "result = %d\n", cmd, be32toh(response->result)); mtx_unlock(&sc->cmd_mtx); return err; } static int mxge_adopt_running_firmware(mxge_softc_t *sc) { struct mcp_gen_header *hdr; const size_t bytes = sizeof (struct mcp_gen_header); size_t hdr_offset; int status; /* find running firmware header */ hdr_offset = htobe32(*(volatile uint32_t *) (sc->sram + MCP_HEADER_PTR_OFFSET)); if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) { device_printf(sc->dev, "Running firmware has bad header offset (%d)\n", (int)hdr_offset); return EIO; } /* copy header of running firmware from SRAM to host memory to * validate firmware */ hdr = malloc(bytes, M_DEVBUF, M_NOWAIT); if (hdr == NULL) { device_printf(sc->dev, "could not malloc firmware hdr\n"); return ENOMEM; } bus_space_read_region_1(rman_get_bustag(sc->mem_res), rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes); status = mxge_validate_firmware(sc, hdr); free(hdr, M_DEVBUF); /* * check to see if adopted firmware has bug where adopting * it will cause broadcasts to be filtered unless the NIC * is kept in ALLMULTI mode */ if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 && sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) { sc->adopted_rx_filter_bug = 1; device_printf(sc->dev, "Adopting fw %d.%d.%d: " "working around rx filter bug\n", sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny); } return status; } static int mxge_load_firmware(mxge_softc_t *sc, int adopt) { volatile uint32_t *confirm; volatile char *submit; char buf_bytes[72]; uint32_t *buf, size, dma_low, dma_high; int status, i; buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL); size = sc->sram_size; status = mxge_load_firmware_helper(sc, &size); if (status) { if (!adopt) return status; /* Try to use the currently running firmware, if it is new enough */ status = mxge_adopt_running_firmware(sc); if (status) { device_printf(sc->dev, "failed to adopt running firmware\n"); return status; } device_printf(sc->dev, "Successfully adopted running firmware\n"); if (sc->tx_boundary == 4096) { device_printf(sc->dev, "Using firmware currently running on NIC" ". For optimal\n"); device_printf(sc->dev, "performance consider loading optimized " "firmware\n"); } sc->fw_name = mxge_fw_unaligned; sc->tx_boundary = 2048; return 0; } /* clear confirmation addr */ confirm = (volatile uint32_t *)sc->cmd; *confirm = 0; wmb(); /* send a reload command to the bootstrap MCP, and wait for the response in the confirmation address. The firmware should write a -1 there to indicate it is alive and well */ dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr); dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr); buf[0] = htobe32(dma_high); /* confirm addr MSW */ buf[1] = htobe32(dma_low); /* confirm addr LSW */ buf[2] = htobe32(0xffffffff); /* confirm data */ /* FIX: All newest firmware should un-protect the bottom of the sram before handoff. However, the very first interfaces do not. Therefore the handoff copy must skip the first 8 bytes */ /* where the code starts*/ buf[3] = htobe32(MXGE_FW_OFFSET + 8); buf[4] = htobe32(size - 8); /* length of code */ buf[5] = htobe32(8); /* where to copy to */ buf[6] = htobe32(0); /* where to jump to */ submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF); mxge_pio_copy(submit, buf, 64); wmb(); DELAY(1000); wmb(); i = 0; while (*confirm != 0xffffffff && i < 20) { DELAY(1000*10); i++; bus_dmamap_sync(sc->cmd_dma.dmat, sc->cmd_dma.map, BUS_DMASYNC_POSTREAD); } if (*confirm != 0xffffffff) { device_printf(sc->dev,"handoff failed (%p = 0x%x)", confirm, *confirm); return ENXIO; } return 0; } static int mxge_update_mac_address(mxge_softc_t *sc) { mxge_cmd_t cmd; uint8_t *addr = sc->mac_addr; int status; cmd.data0 = ((addr[0] << 24) | (addr[1] << 16) | (addr[2] << 8) | addr[3]); cmd.data1 = ((addr[4] << 8) | (addr[5])); status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd); return status; } static int mxge_change_pause(mxge_softc_t *sc, int pause) { mxge_cmd_t cmd; int status; if (pause) status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd); else status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd); if (status) { device_printf(sc->dev, "Failed to set flow control mode\n"); return ENXIO; } sc->pause = pause; return 0; } static void mxge_change_promisc(mxge_softc_t *sc, int promisc) { mxge_cmd_t cmd; int status; if (mxge_always_promisc) promisc = 1; if (promisc) status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd); else status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd); if (status) { device_printf(sc->dev, "Failed to set promisc mode\n"); } } static void mxge_set_multicast_list(mxge_softc_t *sc) { mxge_cmd_t cmd; struct ifmultiaddr *ifma; struct ifnet *ifp = sc->ifp; int err; /* This firmware is known to not support multicast */ if (!sc->fw_multicast_support) return; /* Disable multicast filtering while we play with the lists*/ err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd); if (err != 0) { device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI," " error status: %d\n", err); return; } if (sc->adopted_rx_filter_bug) return; if (ifp->if_flags & IFF_ALLMULTI) /* request to disable multicast filtering, so quit here */ return; /* Flush all the filters */ err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd); if (err != 0) { device_printf(sc->dev, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS" ", error status: %d\n", err); return; } /* Walk the multicast list, and add each address */ if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &cmd.data0, 4); bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4, &cmd.data1, 2); cmd.data0 = htonl(cmd.data0); cmd.data1 = htonl(cmd.data1); err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd); if (err != 0) { device_printf(sc->dev, "Failed " "MXGEFW_JOIN_MULTICAST_GROUP, error status:" "%d\t", err); /* abort, leaving multicast filtering off */ if_maddr_runlock(ifp); return; } } if_maddr_runlock(ifp); /* Enable multicast filtering */ err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd); if (err != 0) { device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI" ", error status: %d\n", err); } } static int mxge_max_mtu(mxge_softc_t *sc) { mxge_cmd_t cmd; int status; if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU) return MXGEFW_MAX_MTU - MXGEFW_PAD; /* try to set nbufs to see if it we can use virtually contiguous jumbos */ cmd.data0 = 0; status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd); if (status == 0) return MXGEFW_MAX_MTU - MXGEFW_PAD; /* otherwise, we're limited to MJUMPAGESIZE */ return MJUMPAGESIZE - MXGEFW_PAD; } static int mxge_reset(mxge_softc_t *sc, int interrupts_setup) { struct mxge_slice_state *ss; mxge_rx_done_t *rx_done; volatile uint32_t *irq_claim; mxge_cmd_t cmd; int slice, status; /* try to send a reset command to the card to see if it is alive */ memset(&cmd, 0, sizeof (cmd)); status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd); if (status != 0) { device_printf(sc->dev, "failed reset\n"); return ENXIO; } mxge_dummy_rdma(sc, 1); /* set the intrq size */ cmd.data0 = sc->rx_ring_size; status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd); /* * Even though we already know how many slices are supported * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES * has magic side effects, and must be called after a reset. * It must be called prior to calling any RSS related cmds, * including assigning an interrupt queue for anything but * slice 0. It must also be called *after* * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by * the firmware to compute offsets. */ if (sc->num_slices > 1) { /* ask the maximum number of slices it supports */ status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd); if (status != 0) { device_printf(sc->dev, "failed to get number of slices\n"); return status; } /* * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior * to setting up the interrupt queue DMA */ cmd.data0 = sc->num_slices; cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE; #ifdef IFNET_BUF_RING cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES; #endif status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd); if (status != 0) { device_printf(sc->dev, "failed to set number of slices\n"); return status; } } if (interrupts_setup) { /* Now exchange information about interrupts */ for (slice = 0; slice < sc->num_slices; slice++) { rx_done = &sc->ss[slice].rx_done; memset(rx_done->entry, 0, sc->rx_ring_size); cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr); cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr); cmd.data2 = slice; status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd); } } status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd); sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0); status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd); irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0); status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd); sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0); if (status != 0) { device_printf(sc->dev, "failed set interrupt parameters\n"); return status; } *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay); /* run a DMA benchmark */ (void) mxge_dma_test(sc, MXGEFW_DMA_TEST); for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; ss->irq_claim = irq_claim + (2 * slice); /* reset mcp/driver shared state back to 0 */ ss->rx_done.idx = 0; ss->rx_done.cnt = 0; ss->tx.req = 0; ss->tx.done = 0; ss->tx.pkt_done = 0; ss->tx.queue_active = 0; ss->tx.activate = 0; ss->tx.deactivate = 0; ss->tx.wake = 0; ss->tx.defrag = 0; ss->tx.stall = 0; ss->rx_big.cnt = 0; ss->rx_small.cnt = 0; ss->lc.lro_bad_csum = 0; ss->lc.lro_queued = 0; ss->lc.lro_flushed = 0; if (ss->fw_stats != NULL) { bzero(ss->fw_stats, sizeof *ss->fw_stats); } } sc->rdma_tags_available = 15; status = mxge_update_mac_address(sc); mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC); mxge_change_pause(sc, sc->pause); mxge_set_multicast_list(sc); if (sc->throttle) { cmd.data0 = sc->throttle; if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd)) { device_printf(sc->dev, "can't enable throttle\n"); } } return status; } static int mxge_change_throttle(SYSCTL_HANDLER_ARGS) { mxge_cmd_t cmd; mxge_softc_t *sc; int err; unsigned int throttle; sc = arg1; throttle = sc->throttle; err = sysctl_handle_int(oidp, &throttle, arg2, req); if (err != 0) { return err; } if (throttle == sc->throttle) return 0; if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE) return EINVAL; mtx_lock(&sc->driver_mtx); cmd.data0 = throttle; err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd); if (err == 0) sc->throttle = throttle; mtx_unlock(&sc->driver_mtx); return err; } static int mxge_change_intr_coal(SYSCTL_HANDLER_ARGS) { mxge_softc_t *sc; unsigned int intr_coal_delay; int err; sc = arg1; intr_coal_delay = sc->intr_coal_delay; err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req); if (err != 0) { return err; } if (intr_coal_delay == sc->intr_coal_delay) return 0; if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000) return EINVAL; mtx_lock(&sc->driver_mtx); *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay); sc->intr_coal_delay = intr_coal_delay; mtx_unlock(&sc->driver_mtx); return err; } static int mxge_change_flow_control(SYSCTL_HANDLER_ARGS) { mxge_softc_t *sc; unsigned int enabled; int err; sc = arg1; enabled = sc->pause; err = sysctl_handle_int(oidp, &enabled, arg2, req); if (err != 0) { return err; } if (enabled == sc->pause) return 0; mtx_lock(&sc->driver_mtx); err = mxge_change_pause(sc, enabled); mtx_unlock(&sc->driver_mtx); return err; } static int mxge_handle_be32(SYSCTL_HANDLER_ARGS) { int err; if (arg1 == NULL) return EFAULT; arg2 = be32toh(*(int *)arg1); arg1 = NULL; err = sysctl_handle_int(oidp, arg1, arg2, req); return err; } static void mxge_rem_sysctls(mxge_softc_t *sc) { struct mxge_slice_state *ss; int slice; if (sc->slice_sysctl_tree == NULL) return; for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; if (ss == NULL || ss->sysctl_tree == NULL) continue; sysctl_ctx_free(&ss->sysctl_ctx); ss->sysctl_tree = NULL; } sysctl_ctx_free(&sc->slice_sysctl_ctx); sc->slice_sysctl_tree = NULL; } static void mxge_add_sysctls(mxge_softc_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; mcp_irq_data_t *fw; struct mxge_slice_state *ss; int slice; char slice_num[8]; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); fw = sc->ss[0].fw_stats; /* random information */ SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number", CTLFLAG_RD, sc->serial_number_string, 0, "serial number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code", CTLFLAG_RD, sc->product_code_string, 0, "product_code"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width", CTLFLAG_RD, &sc->link_width, 0, "tx_boundary"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary", CTLFLAG_RD, &sc->tx_boundary, 0, "tx_boundary"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine", CTLFLAG_RD, &sc->wc, 0, "write combining PIO?"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs", CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs", CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs", CTLFLAG_RD, &sc->read_write_dma, 0, "DMA concurrent Read/Write speed in MB/s"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets", CTLFLAG_RD, &sc->watchdog_resets, 0, "Number of times NIC was reset"); /* performance related tunables */ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay", CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I", "interrupt coalescing delay in usecs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle", CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I", "transmit throttling"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled", CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I", "interrupt coalescing delay in usecs"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait", CTLFLAG_RW, &mxge_deassert_wait, 0, "Wait for IRQ line to go low in ihandler"); /* stats block from firmware is in network byte order. Need to swap it */ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up", CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0, mxge_handle_be32, "I", "link up"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available", CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0, mxge_handle_be32, "I", "rdma_tags_available"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I", "dropped_bad_crc32"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I", "dropped_link_error_or_filtered"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0, mxge_handle_be32, "I", "dropped_link_overflow"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I", "dropped_multicast_filtered"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I", "dropped_no_big_buffer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I", "dropped_no_small_buffer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0, mxge_handle_be32, "I", "dropped_overrun"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I", "dropped_unicast_filtered"); /* verbose printing? */ SYSCTL_ADD_INT(ctx, children, OID_AUTO, "verbose", CTLFLAG_RW, &mxge_verbose, 0, "verbose printing"); /* add counters exported for debugging from all slices */ sysctl_ctx_init(&sc->slice_sysctl_ctx); sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO, "slice", CTLFLAG_RD, 0, ""); for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; sysctl_ctx_init(&ss->sysctl_ctx); ctx = &ss->sysctl_ctx; children = SYSCTL_CHILDREN(sc->slice_sysctl_tree); sprintf(slice_num, "%d", slice); ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num, CTLFLAG_RD, 0, ""); children = SYSCTL_CHILDREN(ss->sysctl_tree); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt", CTLFLAG_RD, &ss->rx_small.cnt, 0, "rx_small_cnt"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt", CTLFLAG_RD, &ss->rx_big.cnt, 0, "rx_small_cnt"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed, 0, "number of lro merge queues flushed"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum, 0, "number of bad csums preventing LRO"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued, 0, "number of frames appended to lro merge" "queues"); #ifndef IFNET_BUF_RING /* only transmit from slice 0 for now */ if (slice > 0) continue; #endif SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req", CTLFLAG_RD, &ss->tx.req, 0, "tx_req"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done", CTLFLAG_RD, &ss->tx.done, 0, "tx_done"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done", CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_done"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_stall", CTLFLAG_RD, &ss->tx.stall, 0, "tx_stall"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_wake", CTLFLAG_RD, &ss->tx.wake, 0, "tx_wake"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_defrag", CTLFLAG_RD, &ss->tx.defrag, 0, "tx_defrag"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active", CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate", CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate", CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate"); } } /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy backwards one at a time and handle ring wraps */ static inline void mxge_submit_req_backwards(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt) { int idx, starting_slot; starting_slot = tx->req; while (cnt > 1) { cnt--; idx = (starting_slot + cnt) & tx->mask; mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src)); wmb(); } } /* * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy * at most 32 bytes at a time, so as to avoid involving the software * pio handler in the nic. We re-write the first segment's flags * to mark them valid only after writing the entire chain */ static inline void mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt) { int idx, i; uint32_t *src_ints; volatile uint32_t *dst_ints; mcp_kreq_ether_send_t *srcp; volatile mcp_kreq_ether_send_t *dstp, *dst; uint8_t last_flags; idx = tx->req & tx->mask; last_flags = src->flags; src->flags = 0; wmb(); dst = dstp = &tx->lanai[idx]; srcp = src; if ((idx + cnt) < tx->mask) { for (i = 0; i < (cnt - 1); i += 2) { mxge_pio_copy(dstp, srcp, 2 * sizeof(*src)); wmb(); /* force write every 32 bytes */ srcp += 2; dstp += 2; } } else { /* submit all but the first request, and ensure that it is submitted below */ mxge_submit_req_backwards(tx, src, cnt); i = 0; } if (i < cnt) { /* submit the first request */ mxge_pio_copy(dstp, srcp, sizeof(*src)); wmb(); /* barrier before setting valid flag */ } /* re-write the last 32-bits with the valid flags */ src->flags = last_flags; src_ints = (uint32_t *)src; src_ints+=3; dst_ints = (volatile uint32_t *)dst; dst_ints+=3; *dst_ints = *src_ints; tx->req += cnt; wmb(); } static int mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m, struct mxge_pkt_info *pi) { struct ether_vlan_header *eh; uint16_t etype; int tso = m->m_pkthdr.csum_flags & (CSUM_TSO); #if IFCAP_TSO6 && defined(INET6) int nxt; #endif eh = mtod(m, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); pi->ip_off = ETHER_HDR_LEN; } switch (etype) { case ETHERTYPE_IP: /* * ensure ip header is in first mbuf, copy it to a * scratch buffer if not */ pi->ip = (struct ip *)(m->m_data + pi->ip_off); pi->ip6 = NULL; if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) { m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip), ss->scratch); pi->ip = (struct ip *)(ss->scratch + pi->ip_off); } pi->ip_hlen = pi->ip->ip_hl << 2; if (!tso) return 0; if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen + sizeof(struct tcphdr))) { m_copydata(m, 0, pi->ip_off + pi->ip_hlen + sizeof(struct tcphdr), ss->scratch); pi->ip = (struct ip *)(ss->scratch + pi->ip_off); } pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen); break; #if IFCAP_TSO6 && defined(INET6) case ETHERTYPE_IPV6: pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off); if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) { m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6), ss->scratch); pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off); } nxt = 0; pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt); pi->ip_hlen -= pi->ip_off; if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) return EINVAL; if (!tso) return 0; if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen) return EINVAL; if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen + sizeof(struct tcphdr))) { m_copydata(m, 0, pi->ip_off + pi->ip_hlen + sizeof(struct tcphdr), ss->scratch); pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off); } pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen); break; #endif default: return EINVAL; } return 0; } #if IFCAP_TSO4 static void mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m, int busdma_seg_cnt, struct mxge_pkt_info *pi) { mxge_tx_ring_t *tx; mcp_kreq_ether_send_t *req; bus_dma_segment_t *seg; uint32_t low, high_swapped; int len, seglen, cum_len, cum_len_next; int next_is_first, chop, cnt, rdma_count, small; uint16_t pseudo_hdr_offset, cksum_offset, mss, sum; uint8_t flags, flags_next; static int once; mss = m->m_pkthdr.tso_segsz; /* negative cum_len signifies to the * send loop that we are still in the * header portion of the TSO packet. */ cksum_offset = pi->ip_off + pi->ip_hlen; cum_len = -(cksum_offset + (pi->tcp->th_off << 2)); /* TSO implies checksum offload on this hardware */ if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) { /* * If packet has full TCP csum, replace it with pseudo hdr * sum that the NIC expects, otherwise the NIC will emit * packets with bad TCP checksums. */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (pi->ip6) { #if (CSUM_TCP_IPV6 != 0) && defined(INET6) m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6; sum = in6_cksum_pseudo(pi->ip6, m->m_pkthdr.len - cksum_offset, IPPROTO_TCP, 0); #endif } else { #ifdef INET m->m_pkthdr.csum_flags |= CSUM_TCP; sum = in_pseudo(pi->ip->ip_src.s_addr, pi->ip->ip_dst.s_addr, htons(IPPROTO_TCP + (m->m_pkthdr.len - cksum_offset))); #endif } m_copyback(m, offsetof(struct tcphdr, th_sum) + cksum_offset, sizeof(sum), (caddr_t)&sum); } flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST; /* for TSO, pseudo_hdr_offset holds mss. * The firmware figures out where to put * the checksum by parsing the header. */ pseudo_hdr_offset = htobe16(mss); if (pi->ip6) { /* * for IPv6 TSO, the "checksum offset" is re-purposed * to store the TCP header len */ cksum_offset = (pi->tcp->th_off << 2); } tx = &ss->tx; req = tx->req_list; seg = tx->seg_list; cnt = 0; rdma_count = 0; /* "rdma_count" is the number of RDMAs belonging to the * current packet BEFORE the current send request. For * non-TSO packets, this is equal to "count". * For TSO packets, rdma_count needs to be reset * to 0 after a segment cut. * * The rdma_count field of the send request is * the number of RDMAs of the packet starting at * that request. For TSO send requests with one ore more cuts * in the middle, this is the number of RDMAs starting * after the last cut in the request. All previous * segments before the last cut implicitly have 1 RDMA. * * Since the number of RDMAs is not known beforehand, * it must be filled-in retroactively - after each * segmentation cut or at the end of the entire packet. */ while (busdma_seg_cnt) { /* Break the busdma segment up into pieces*/ low = MXGE_LOWPART_TO_U32(seg->ds_addr); high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr)); len = seg->ds_len; while (len) { flags_next = flags & ~MXGEFW_FLAGS_FIRST; seglen = len; cum_len_next = cum_len + seglen; (req-rdma_count)->rdma_count = rdma_count + 1; if (__predict_true(cum_len >= 0)) { /* payload */ chop = (cum_len_next > mss); cum_len_next = cum_len_next % mss; next_is_first = (cum_len_next == 0); flags |= chop * MXGEFW_FLAGS_TSO_CHOP; flags_next |= next_is_first * MXGEFW_FLAGS_FIRST; rdma_count |= -(chop | next_is_first); rdma_count += chop & !next_is_first; } else if (cum_len_next >= 0) { /* header ends */ rdma_count = -1; cum_len_next = 0; seglen = -cum_len; small = (mss <= MXGEFW_SEND_SMALL_SIZE); flags_next = MXGEFW_FLAGS_TSO_PLD | MXGEFW_FLAGS_FIRST | (small * MXGEFW_FLAGS_SMALL); } req->addr_high = high_swapped; req->addr_low = htobe32(low); req->pseudo_hdr_offset = pseudo_hdr_offset; req->pad = 0; req->rdma_count = 1; req->length = htobe16(seglen); req->cksum_offset = cksum_offset; req->flags = flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD); low += seglen; len -= seglen; cum_len = cum_len_next; flags = flags_next; req++; cnt++; rdma_count++; if (cksum_offset != 0 && !pi->ip6) { if (__predict_false(cksum_offset > seglen)) cksum_offset -= seglen; else cksum_offset = 0; } if (__predict_false(cnt > tx->max_desc)) goto drop; } busdma_seg_cnt--; seg++; } (req-rdma_count)->rdma_count = rdma_count; do { req--; req->flags |= MXGEFW_FLAGS_TSO_LAST; } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST))); tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1; mxge_submit_req(tx, tx->req_list, cnt); #ifdef IFNET_BUF_RING if ((ss->sc->num_slices > 1) && tx->queue_active == 0) { /* tell the NIC to start polling this slice */ *tx->send_go = 1; tx->queue_active = 1; tx->activate++; wmb(); } #endif return; drop: bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map); m_freem(m); ss->oerrors++; if (!once) { printf("tx->max_desc exceeded via TSO!\n"); printf("mss = %d, %ld, %d!\n", mss, (long)seg - (long)tx->seg_list, tx->max_desc); once = 1; } return; } #endif /* IFCAP_TSO4 */ #ifdef MXGE_NEW_VLAN_API /* * We reproduce the software vlan tag insertion from * net/if_vlan.c:vlan_start() here so that we can advertise "hardware" * vlan tag insertion. We need to advertise this in order to have the * vlan interface respect our csum offload flags. */ static struct mbuf * mxge_vlan_tag_insert(struct mbuf *m) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (__predict_false(m == NULL)) return NULL; if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (__predict_false(m == NULL)) return NULL; } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_flags &= ~M_VLANTAG; return m; } #endif /* MXGE_NEW_VLAN_API */ static void mxge_encap(struct mxge_slice_state *ss, struct mbuf *m) { struct mxge_pkt_info pi = {0,0,0,0}; mxge_softc_t *sc; mcp_kreq_ether_send_t *req; bus_dma_segment_t *seg; struct mbuf *m_tmp; struct ifnet *ifp; mxge_tx_ring_t *tx; int cnt, cum_len, err, i, idx, odd_flag; uint16_t pseudo_hdr_offset; uint8_t flags, cksum_offset; sc = ss->sc; ifp = sc->ifp; tx = &ss->tx; #ifdef MXGE_NEW_VLAN_API if (m->m_flags & M_VLANTAG) { m = mxge_vlan_tag_insert(m); if (__predict_false(m == NULL)) goto drop_without_m; } #endif if (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) { if (mxge_parse_tx(ss, m, &pi)) goto drop; } /* (try to) map the frame for DMA */ idx = tx->req & tx->mask; err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map, m, tx->seg_list, &cnt, BUS_DMA_NOWAIT); if (__predict_false(err == EFBIG)) { /* Too many segments in the chain. Try to defrag */ m_tmp = m_defrag(m, M_NOWAIT); if (m_tmp == NULL) { goto drop; } ss->tx.defrag++; m = m_tmp; err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map, m, tx->seg_list, &cnt, BUS_DMA_NOWAIT); } if (__predict_false(err != 0)) { device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d" " packet len = %d\n", err, m->m_pkthdr.len); goto drop; } bus_dmamap_sync(tx->dmat, tx->info[idx].map, BUS_DMASYNC_PREWRITE); tx->info[idx].m = m; #if IFCAP_TSO4 /* TSO is different enough, we handle it in another routine */ if (m->m_pkthdr.csum_flags & (CSUM_TSO)) { mxge_encap_tso(ss, m, cnt, &pi); return; } #endif req = tx->req_list; cksum_offset = 0; pseudo_hdr_offset = 0; flags = MXGEFW_FLAGS_NO_TSO; /* checksum offloading? */ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) { /* ensure ip header is in first mbuf, copy it to a scratch buffer if not */ cksum_offset = pi.ip_off + pi.ip_hlen; pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data; pseudo_hdr_offset = htobe16(pseudo_hdr_offset); req->cksum_offset = cksum_offset; flags |= MXGEFW_FLAGS_CKSUM; odd_flag = MXGEFW_FLAGS_ALIGN_ODD; } else { odd_flag = 0; } if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE) flags |= MXGEFW_FLAGS_SMALL; /* convert segments into a request list */ cum_len = 0; seg = tx->seg_list; req->flags = MXGEFW_FLAGS_FIRST; for (i = 0; i < cnt; i++) { req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr)); req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr)); req->length = htobe16(seg->ds_len); req->cksum_offset = cksum_offset; if (cksum_offset > seg->ds_len) cksum_offset -= seg->ds_len; else cksum_offset = 0; req->pseudo_hdr_offset = pseudo_hdr_offset; req->pad = 0; /* complete solid 16-byte block */ req->rdma_count = 1; req->flags |= flags | ((cum_len & 1) * odd_flag); cum_len += seg->ds_len; seg++; req++; req->flags = 0; } req--; /* pad runts to 60 bytes */ if (cum_len < 60) { req++; req->addr_low = htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr)); req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr)); req->length = htobe16(60 - cum_len); req->cksum_offset = 0; req->pseudo_hdr_offset = pseudo_hdr_offset; req->pad = 0; /* complete solid 16-byte block */ req->rdma_count = 1; req->flags |= flags | ((cum_len & 1) * odd_flag); cnt++; } tx->req_list[0].rdma_count = cnt; #if 0 /* print what the firmware will see */ for (i = 0; i < cnt; i++) { printf("%d: addr: 0x%x 0x%x len:%d pso%d," "cso:%d, flags:0x%x, rdma:%d\n", i, (int)ntohl(tx->req_list[i].addr_high), (int)ntohl(tx->req_list[i].addr_low), (int)ntohs(tx->req_list[i].length), (int)ntohs(tx->req_list[i].pseudo_hdr_offset), tx->req_list[i].cksum_offset, tx->req_list[i].flags, tx->req_list[i].rdma_count); } printf("--------------\n"); #endif tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1; mxge_submit_req(tx, tx->req_list, cnt); #ifdef IFNET_BUF_RING if ((ss->sc->num_slices > 1) && tx->queue_active == 0) { /* tell the NIC to start polling this slice */ *tx->send_go = 1; tx->queue_active = 1; tx->activate++; wmb(); } #endif return; drop: m_freem(m); drop_without_m: ss->oerrors++; return; } #ifdef IFNET_BUF_RING static void mxge_qflush(struct ifnet *ifp) { mxge_softc_t *sc = ifp->if_softc; mxge_tx_ring_t *tx; struct mbuf *m; int slice; for (slice = 0; slice < sc->num_slices; slice++) { tx = &sc->ss[slice].tx; mtx_lock(&tx->mtx); while ((m = buf_ring_dequeue_sc(tx->br)) != NULL) m_freem(m); mtx_unlock(&tx->mtx); } if_qflush(ifp); } static inline void mxge_start_locked(struct mxge_slice_state *ss) { mxge_softc_t *sc; struct mbuf *m; struct ifnet *ifp; mxge_tx_ring_t *tx; sc = ss->sc; ifp = sc->ifp; tx = &ss->tx; while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) { m = drbr_dequeue(ifp, tx->br); if (m == NULL) { return; } /* let BPF see it */ BPF_MTAP(ifp, m); /* give it to the nic */ mxge_encap(ss, m); } /* ran out of transmit slots */ if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0) && (!drbr_empty(ifp, tx->br))) { ss->if_drv_flags |= IFF_DRV_OACTIVE; tx->stall++; } } static int mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m) { mxge_softc_t *sc; struct ifnet *ifp; mxge_tx_ring_t *tx; int err; sc = ss->sc; ifp = sc->ifp; tx = &ss->tx; if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) { err = drbr_enqueue(ifp, tx->br, m); return (err); } if (!drbr_needs_enqueue(ifp, tx->br) && ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) { /* let BPF see it */ BPF_MTAP(ifp, m); /* give it to the nic */ mxge_encap(ss, m); } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) { return (err); } if (!drbr_empty(ifp, tx->br)) mxge_start_locked(ss); return (0); } static int mxge_transmit(struct ifnet *ifp, struct mbuf *m) { mxge_softc_t *sc = ifp->if_softc; struct mxge_slice_state *ss; mxge_tx_ring_t *tx; int err = 0; int slice; slice = m->m_pkthdr.flowid; slice &= (sc->num_slices - 1); /* num_slices always power of 2 */ ss = &sc->ss[slice]; tx = &ss->tx; if (mtx_trylock(&tx->mtx)) { err = mxge_transmit_locked(ss, m); mtx_unlock(&tx->mtx); } else { err = drbr_enqueue(ifp, tx->br, m); } return (err); } #else static inline void mxge_start_locked(struct mxge_slice_state *ss) { mxge_softc_t *sc; struct mbuf *m; struct ifnet *ifp; mxge_tx_ring_t *tx; sc = ss->sc; ifp = sc->ifp; tx = &ss->tx; while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { return; } /* let BPF see it */ BPF_MTAP(ifp, m); /* give it to the nic */ mxge_encap(ss, m); } /* ran out of transmit slots */ if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) { sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE; tx->stall++; } } #endif static void mxge_start(struct ifnet *ifp) { mxge_softc_t *sc = ifp->if_softc; struct mxge_slice_state *ss; /* only use the first slice for now */ ss = &sc->ss[0]; mtx_lock(&ss->tx.mtx); mxge_start_locked(ss); mtx_unlock(&ss->tx.mtx); } /* * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy * at most 32 bytes at a time, so as to avoid involving the software * pio handler in the nic. We re-write the first segment's low * DMA address to mark it valid only after we write the entire chunk * in a burst */ static inline void mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src) { uint32_t low; low = src->addr_low; src->addr_low = 0xffffffff; mxge_pio_copy(dst, src, 4 * sizeof (*src)); wmb(); mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src)); wmb(); src->addr_low = low; dst->addr_low = low; wmb(); } static int mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx) { bus_dma_segment_t seg; struct mbuf *m; mxge_rx_ring_t *rx = &ss->rx_small; int cnt, err; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { rx->alloc_fail++; err = ENOBUFS; goto done; } m->m_len = MHLEN; err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m, &seg, &cnt, BUS_DMA_NOWAIT); if (err != 0) { m_free(m); goto done; } rx->info[idx].m = m; rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr)); rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr)); done: if ((idx & 7) == 7) mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]); return err; } static int mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx) { bus_dma_segment_t seg[3]; struct mbuf *m; mxge_rx_ring_t *rx = &ss->rx_big; int cnt, err, i; m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size); if (m == NULL) { rx->alloc_fail++; err = ENOBUFS; goto done; } m->m_len = rx->mlen; err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m, seg, &cnt, BUS_DMA_NOWAIT); if (err != 0) { m_free(m); goto done; } rx->info[idx].m = m; rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr)); rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr)); #if MXGE_VIRT_JUMBOS for (i = 1; i < cnt; i++) { rx->shadow[idx + i].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr)); rx->shadow[idx + i].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr)); } #endif done: for (i = 0; i < rx->nbufs; i++) { if ((idx & 7) == 7) { mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]); } idx++; } return err; } #ifdef INET6 static uint16_t mxge_csum_generic(uint16_t *raw, int len) { uint32_t csum; csum = 0; while (len > 0) { csum += *raw; raw++; len -= 2; } csum = (csum >> 16) + (csum & 0xffff); csum = (csum >> 16) + (csum & 0xffff); return (uint16_t)csum; } static inline uint16_t mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum) { uint32_t partial; int nxt, cksum_offset; struct ip6_hdr *ip6 = p; uint16_t c; nxt = ip6->ip6_nxt; cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN; if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) { cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN, IPPROTO_IPV6, &nxt); if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) return (1); } /* * IPv6 headers do not contain a checksum, and hence * do not checksum to zero, so they don't "fall out" * of the partial checksum calculation like IPv4 * headers do. We need to fix the partial checksum by * subtracting the checksum of the IPv6 header. */ partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset - ETHER_HDR_LEN); csum += ~partial; csum += (csum < ~partial); csum = (csum >> 16) + (csum & 0xFFFF); csum = (csum >> 16) + (csum & 0xFFFF); c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt, csum); c ^= 0xffff; return (c); } #endif /* INET6 */ /* * Myri10GE hardware checksums are not valid if the sender * padded the frame with non-zero padding. This is because * the firmware just does a simple 16-bit 1s complement * checksum across the entire frame, excluding the first 14 * bytes. It is best to simply to check the checksum and * tell the stack about it only if the checksum is good */ static inline uint16_t mxge_rx_csum(struct mbuf *m, int csum) { struct ether_header *eh; #ifdef INET struct ip *ip; #endif #if defined(INET) || defined(INET6) int cap = m->m_pkthdr.rcvif->if_capenable; #endif uint16_t c, etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); switch (etype) { #ifdef INET case ETHERTYPE_IP: if ((cap & IFCAP_RXCSUM) == 0) return (1); ip = (struct ip *)(eh + 1); if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) return (1); c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(ntohs(csum) + ntohs(ip->ip_len) - (ip->ip_hl << 2) + ip->ip_p)); c ^= 0xffff; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: if ((cap & IFCAP_RXCSUM_IPV6) == 0) return (1); c = mxge_rx_csum6((eh + 1), m, csum); break; #endif default: c = 1; } return (c); } static void mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum) { struct ether_vlan_header *evl; struct ether_header *eh; uint32_t partial; evl = mtod(m, struct ether_vlan_header *); eh = mtod(m, struct ether_header *); /* * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes * after what the firmware thought was the end of the ethernet * header. */ /* put checksum into host byte order */ *csum = ntohs(*csum); partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN)); (*csum) += ~partial; (*csum) += ((*csum) < ~partial); (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF); (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF); /* restore checksum to network byte order; later consumers expect this */ *csum = htons(*csum); /* save the tag */ #ifdef MXGE_NEW_VLAN_API m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); #else { struct m_tag *mtag; mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int), M_NOWAIT); if (mtag == NULL) return; VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag); m_tag_prepend(m, mtag); } #endif m->m_flags |= M_VLANTAG; /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. */ bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } static inline void mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum, int lro) { mxge_softc_t *sc; struct ifnet *ifp; struct mbuf *m; struct ether_header *eh; mxge_rx_ring_t *rx; bus_dmamap_t old_map; int idx; sc = ss->sc; ifp = sc->ifp; rx = &ss->rx_big; idx = rx->cnt & rx->mask; rx->cnt += rx->nbufs; /* save a pointer to the received mbuf */ m = rx->info[idx].m; /* try to replace the received mbuf */ if (mxge_get_buf_big(ss, rx->extra_map, idx)) { /* drop the frame -- the old mbuf is re-cycled */ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; } /* unmap the received buffer */ old_map = rx->info[idx].map; bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rx->dmat, old_map); /* swap the bus_dmamap_t's */ rx->info[idx].map = rx->extra_map; rx->extra_map = old_map; /* mcp implicitly skips 1st 2 bytes so that packet is properly * aligned */ m->m_data += MXGEFW_PAD; m->m_pkthdr.rcvif = ifp; m->m_len = m->m_pkthdr.len = len; ss->ipackets++; eh = mtod(m, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { mxge_vlan_tag_remove(m, &csum); } /* if the checksum is valid, mark it in the mbuf header */ if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) && (0 == mxge_rx_csum(m, csum))) { /* Tell the stack that the checksum is good */ m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID; #if defined(INET) || defined (INET6) if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0))) return; #endif } /* flowid only valid if RSS hashing is enabled */ if (sc->num_slices > 1) { m->m_pkthdr.flowid = (ss - sc->ss); - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); } /* pass the frame up the stack */ (*ifp->if_input)(ifp, m); } static inline void mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum, int lro) { mxge_softc_t *sc; struct ifnet *ifp; struct ether_header *eh; struct mbuf *m; mxge_rx_ring_t *rx; bus_dmamap_t old_map; int idx; sc = ss->sc; ifp = sc->ifp; rx = &ss->rx_small; idx = rx->cnt & rx->mask; rx->cnt++; /* save a pointer to the received mbuf */ m = rx->info[idx].m; /* try to replace the received mbuf */ if (mxge_get_buf_small(ss, rx->extra_map, idx)) { /* drop the frame -- the old mbuf is re-cycled */ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; } /* unmap the received buffer */ old_map = rx->info[idx].map; bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rx->dmat, old_map); /* swap the bus_dmamap_t's */ rx->info[idx].map = rx->extra_map; rx->extra_map = old_map; /* mcp implicitly skips 1st 2 bytes so that packet is properly * aligned */ m->m_data += MXGEFW_PAD; m->m_pkthdr.rcvif = ifp; m->m_len = m->m_pkthdr.len = len; ss->ipackets++; eh = mtod(m, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { mxge_vlan_tag_remove(m, &csum); } /* if the checksum is valid, mark it in the mbuf header */ if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) && (0 == mxge_rx_csum(m, csum))) { /* Tell the stack that the checksum is good */ m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID; #if defined(INET) || defined (INET6) if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum))) return; #endif } /* flowid only valid if RSS hashing is enabled */ if (sc->num_slices > 1) { m->m_pkthdr.flowid = (ss - sc->ss); - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); } /* pass the frame up the stack */ (*ifp->if_input)(ifp, m); } static inline void mxge_clean_rx_done(struct mxge_slice_state *ss) { mxge_rx_done_t *rx_done = &ss->rx_done; int limit = 0; uint16_t length; uint16_t checksum; int lro; lro = ss->sc->ifp->if_capenable & IFCAP_LRO; while (rx_done->entry[rx_done->idx].length != 0) { length = ntohs(rx_done->entry[rx_done->idx].length); rx_done->entry[rx_done->idx].length = 0; checksum = rx_done->entry[rx_done->idx].checksum; if (length <= (MHLEN - MXGEFW_PAD)) mxge_rx_done_small(ss, length, checksum, lro); else mxge_rx_done_big(ss, length, checksum, lro); rx_done->cnt++; rx_done->idx = rx_done->cnt & rx_done->mask; /* limit potential for livelock */ if (__predict_false(++limit > rx_done->mask / 2)) break; } #if defined(INET) || defined (INET6) while (!SLIST_EMPTY(&ss->lc.lro_active)) { struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active); SLIST_REMOVE_HEAD(&ss->lc.lro_active, next); tcp_lro_flush(&ss->lc, lro); } #endif } static inline void mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx) { struct ifnet *ifp; mxge_tx_ring_t *tx; struct mbuf *m; bus_dmamap_t map; int idx; int *flags; tx = &ss->tx; ifp = ss->sc->ifp; while (tx->pkt_done != mcp_idx) { idx = tx->done & tx->mask; tx->done++; m = tx->info[idx].m; /* mbuf and DMA map only attached to the first segment per-mbuf */ if (m != NULL) { ss->obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) ss->omcasts++; ss->opackets++; tx->info[idx].m = NULL; map = tx->info[idx].map; bus_dmamap_unload(tx->dmat, map); m_freem(m); } if (tx->info[idx].flag) { tx->info[idx].flag = 0; tx->pkt_done++; } } /* If we have space, clear IFF_OACTIVE to tell the stack that its OK to send packets */ #ifdef IFNET_BUF_RING flags = &ss->if_drv_flags; #else flags = &ifp->if_drv_flags; #endif mtx_lock(&ss->tx.mtx); if ((*flags) & IFF_DRV_OACTIVE && tx->req - tx->done < (tx->mask + 1)/4) { *(flags) &= ~IFF_DRV_OACTIVE; ss->tx.wake++; mxge_start_locked(ss); } #ifdef IFNET_BUF_RING if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) { /* let the NIC stop polling this queue, since there * are no more transmits pending */ if (tx->req == tx->done) { *tx->send_stop = 1; tx->queue_active = 0; tx->deactivate++; wmb(); } } #endif mtx_unlock(&ss->tx.mtx); } static struct mxge_media_type mxge_xfp_media_types[] = { {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"}, {IFM_10G_SR, (1 << 7), "10GBASE-SR"}, {IFM_10G_LR, (1 << 6), "10GBASE-LR"}, {0, (1 << 5), "10GBASE-ER"}, {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"}, {0, (1 << 3), "10GBASE-SW"}, {0, (1 << 2), "10GBASE-LW"}, {0, (1 << 1), "10GBASE-EW"}, {0, (1 << 0), "Reserved"} }; static struct mxge_media_type mxge_sfp_media_types[] = { {IFM_10G_TWINAX, 0, "10GBASE-Twinax"}, {0, (1 << 7), "Reserved"}, {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"}, {IFM_10G_LR, (1 << 5), "10GBASE-LR"}, {IFM_10G_SR, (1 << 4), "10GBASE-SR"}, {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"} }; static void mxge_media_set(mxge_softc_t *sc, int media_type) { ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL); ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type); sc->current_media = media_type; sc->media.ifm_media = sc->media.ifm_cur->ifm_media; } static void mxge_media_init(mxge_softc_t *sc) { char *ptr; int i; ifmedia_removeall(&sc->media); mxge_media_set(sc, IFM_AUTO); /* * parse the product code to deterimine the interface type * (CX4, XFP, Quad Ribbon Fiber) by looking at the character * after the 3rd dash in the driver's cached copy of the * EEPROM's product code string. */ ptr = sc->product_code_string; if (ptr == NULL) { device_printf(sc->dev, "Missing product code\n"); return; } for (i = 0; i < 3; i++, ptr++) { ptr = strchr(ptr, '-'); if (ptr == NULL) { device_printf(sc->dev, "only %d dashes in PC?!?\n", i); return; } } if (*ptr == 'C' || *(ptr +1) == 'C') { /* -C is CX4 */ sc->connector = MXGE_CX4; mxge_media_set(sc, IFM_10G_CX4); } else if (*ptr == 'Q') { /* -Q is Quad Ribbon Fiber */ sc->connector = MXGE_QRF; device_printf(sc->dev, "Quad Ribbon Fiber Media\n"); /* FreeBSD has no media type for Quad ribbon fiber */ } else if (*ptr == 'R') { /* -R is XFP */ sc->connector = MXGE_XFP; } else if (*ptr == 'S' || *(ptr +1) == 'S') { /* -S or -2S is SFP+ */ sc->connector = MXGE_SFP; } else { device_printf(sc->dev, "Unknown media type: %c\n", *ptr); } } /* * Determine the media type for a NIC. Some XFPs will identify * themselves only when their link is up, so this is initiated via a * link up interrupt. However, this can potentially take up to * several milliseconds, so it is run via the watchdog routine, rather * than in the interrupt handler itself. */ static void mxge_media_probe(mxge_softc_t *sc) { mxge_cmd_t cmd; char *cage_type; struct mxge_media_type *mxge_media_types = NULL; int i, err, ms, mxge_media_type_entries; uint32_t byte; sc->need_media_probe = 0; if (sc->connector == MXGE_XFP) { /* -R is XFP */ mxge_media_types = mxge_xfp_media_types; mxge_media_type_entries = sizeof (mxge_xfp_media_types) / sizeof (mxge_xfp_media_types[0]); byte = MXGE_XFP_COMPLIANCE_BYTE; cage_type = "XFP"; } else if (sc->connector == MXGE_SFP) { /* -S or -2S is SFP+ */ mxge_media_types = mxge_sfp_media_types; mxge_media_type_entries = sizeof (mxge_sfp_media_types) / sizeof (mxge_sfp_media_types[0]); cage_type = "SFP+"; byte = 3; } else { /* nothing to do; media type cannot change */ return; } /* * At this point we know the NIC has an XFP cage, so now we * try to determine what is in the cage by using the * firmware's XFP I2C commands to read the XFP 10GbE compilance * register. We read just one byte, which may take over * a millisecond */ cmd.data0 = 0; /* just fetch 1 byte, not all 256 */ cmd.data1 = byte; err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd); if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) { device_printf(sc->dev, "failed to read XFP\n"); } if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) { device_printf(sc->dev, "Type R/S with no XFP!?!?\n"); } if (err != MXGEFW_CMD_OK) { return; } /* now we wait for the data to be cached */ cmd.data0 = byte; err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd); for (ms = 0; (err == EBUSY) && (ms < 50); ms++) { DELAY(1000); cmd.data0 = byte; err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd); } if (err != MXGEFW_CMD_OK) { device_printf(sc->dev, "failed to read %s (%d, %dms)\n", cage_type, err, ms); return; } if (cmd.data0 == mxge_media_types[0].bitmask) { if (mxge_verbose) device_printf(sc->dev, "%s:%s\n", cage_type, mxge_media_types[0].name); if (sc->current_media != mxge_media_types[0].flag) { mxge_media_init(sc); mxge_media_set(sc, mxge_media_types[0].flag); } return; } for (i = 1; i < mxge_media_type_entries; i++) { if (cmd.data0 & mxge_media_types[i].bitmask) { if (mxge_verbose) device_printf(sc->dev, "%s:%s\n", cage_type, mxge_media_types[i].name); if (sc->current_media != mxge_media_types[i].flag) { mxge_media_init(sc); mxge_media_set(sc, mxge_media_types[i].flag); } return; } } if (mxge_verbose) device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type, cmd.data0); return; } static void mxge_intr(void *arg) { struct mxge_slice_state *ss = arg; mxge_softc_t *sc = ss->sc; mcp_irq_data_t *stats = ss->fw_stats; mxge_tx_ring_t *tx = &ss->tx; mxge_rx_done_t *rx_done = &ss->rx_done; uint32_t send_done_count; uint8_t valid; #ifndef IFNET_BUF_RING /* an interrupt on a non-zero slice is implicitly valid since MSI-X irqs are not shared */ if (ss != sc->ss) { mxge_clean_rx_done(ss); *ss->irq_claim = be32toh(3); return; } #endif /* make sure the DMA has finished */ if (!stats->valid) { return; } valid = stats->valid; if (sc->legacy_irq) { /* lower legacy IRQ */ *sc->irq_deassert = 0; if (!mxge_deassert_wait) /* don't wait for conf. that irq is low */ stats->valid = 0; } else { stats->valid = 0; } /* loop while waiting for legacy irq deassertion */ do { /* check for transmit completes and receives */ send_done_count = be32toh(stats->send_done_count); while ((send_done_count != tx->pkt_done) || (rx_done->entry[rx_done->idx].length != 0)) { if (send_done_count != tx->pkt_done) mxge_tx_done(ss, (int)send_done_count); mxge_clean_rx_done(ss); send_done_count = be32toh(stats->send_done_count); } if (sc->legacy_irq && mxge_deassert_wait) wmb(); } while (*((volatile uint8_t *) &stats->valid)); /* fw link & error stats meaningful only on the first slice */ if (__predict_false((ss == sc->ss) && stats->stats_updated)) { if (sc->link_state != stats->link_up) { sc->link_state = stats->link_up; if (sc->link_state) { if_link_state_change(sc->ifp, LINK_STATE_UP); if (mxge_verbose) device_printf(sc->dev, "link up\n"); } else { if_link_state_change(sc->ifp, LINK_STATE_DOWN); if (mxge_verbose) device_printf(sc->dev, "link down\n"); } sc->need_media_probe = 1; } if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) { sc->rdma_tags_available = be32toh(stats->rdma_tags_available); device_printf(sc->dev, "RDMA timed out! %d tags " "left\n", sc->rdma_tags_available); } if (stats->link_down) { sc->down_cnt += stats->link_down; sc->link_state = 0; if_link_state_change(sc->ifp, LINK_STATE_DOWN); } } /* check to see if we have rx token to pass back */ if (valid & 0x1) *ss->irq_claim = be32toh(3); *(ss->irq_claim + 1) = be32toh(3); } static void mxge_init(void *arg) { mxge_softc_t *sc = arg; struct ifnet *ifp = sc->ifp; mtx_lock(&sc->driver_mtx); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) (void) mxge_open(sc); mtx_unlock(&sc->driver_mtx); } static void mxge_free_slice_mbufs(struct mxge_slice_state *ss) { int i; #if defined(INET) || defined(INET6) tcp_lro_free(&ss->lc); #endif for (i = 0; i <= ss->rx_big.mask; i++) { if (ss->rx_big.info[i].m == NULL) continue; bus_dmamap_unload(ss->rx_big.dmat, ss->rx_big.info[i].map); m_freem(ss->rx_big.info[i].m); ss->rx_big.info[i].m = NULL; } for (i = 0; i <= ss->rx_small.mask; i++) { if (ss->rx_small.info[i].m == NULL) continue; bus_dmamap_unload(ss->rx_small.dmat, ss->rx_small.info[i].map); m_freem(ss->rx_small.info[i].m); ss->rx_small.info[i].m = NULL; } /* transmit ring used only on the first slice */ if (ss->tx.info == NULL) return; for (i = 0; i <= ss->tx.mask; i++) { ss->tx.info[i].flag = 0; if (ss->tx.info[i].m == NULL) continue; bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map); m_freem(ss->tx.info[i].m); ss->tx.info[i].m = NULL; } } static void mxge_free_mbufs(mxge_softc_t *sc) { int slice; for (slice = 0; slice < sc->num_slices; slice++) mxge_free_slice_mbufs(&sc->ss[slice]); } static void mxge_free_slice_rings(struct mxge_slice_state *ss) { int i; if (ss->rx_done.entry != NULL) mxge_dma_free(&ss->rx_done.dma); ss->rx_done.entry = NULL; if (ss->tx.req_bytes != NULL) free(ss->tx.req_bytes, M_DEVBUF); ss->tx.req_bytes = NULL; if (ss->tx.seg_list != NULL) free(ss->tx.seg_list, M_DEVBUF); ss->tx.seg_list = NULL; if (ss->rx_small.shadow != NULL) free(ss->rx_small.shadow, M_DEVBUF); ss->rx_small.shadow = NULL; if (ss->rx_big.shadow != NULL) free(ss->rx_big.shadow, M_DEVBUF); ss->rx_big.shadow = NULL; if (ss->tx.info != NULL) { if (ss->tx.dmat != NULL) { for (i = 0; i <= ss->tx.mask; i++) { bus_dmamap_destroy(ss->tx.dmat, ss->tx.info[i].map); } bus_dma_tag_destroy(ss->tx.dmat); } free(ss->tx.info, M_DEVBUF); } ss->tx.info = NULL; if (ss->rx_small.info != NULL) { if (ss->rx_small.dmat != NULL) { for (i = 0; i <= ss->rx_small.mask; i++) { bus_dmamap_destroy(ss->rx_small.dmat, ss->rx_small.info[i].map); } bus_dmamap_destroy(ss->rx_small.dmat, ss->rx_small.extra_map); bus_dma_tag_destroy(ss->rx_small.dmat); } free(ss->rx_small.info, M_DEVBUF); } ss->rx_small.info = NULL; if (ss->rx_big.info != NULL) { if (ss->rx_big.dmat != NULL) { for (i = 0; i <= ss->rx_big.mask; i++) { bus_dmamap_destroy(ss->rx_big.dmat, ss->rx_big.info[i].map); } bus_dmamap_destroy(ss->rx_big.dmat, ss->rx_big.extra_map); bus_dma_tag_destroy(ss->rx_big.dmat); } free(ss->rx_big.info, M_DEVBUF); } ss->rx_big.info = NULL; } static void mxge_free_rings(mxge_softc_t *sc) { int slice; for (slice = 0; slice < sc->num_slices; slice++) mxge_free_slice_rings(&sc->ss[slice]); } static int mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries, int tx_ring_entries) { mxge_softc_t *sc = ss->sc; size_t bytes; int err, i; /* allocate per-slice receive resources */ ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1; ss->rx_done.mask = (2 * rx_ring_entries) - 1; /* allocate the rx shadow rings */ bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow); ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow); ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); /* allocate the rx host info rings */ bytes = rx_ring_entries * sizeof (*ss->rx_small.info); ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); bytes = rx_ring_entries * sizeof (*ss->rx_big.info); ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); /* allocate the rx busdma resources */ err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 1, /* alignment */ 4096, /* boundary */ BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ MHLEN, /* maxsize */ 1, /* num segs */ MHLEN, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, NULL, /* lock */ &ss->rx_small.dmat); /* tag */ if (err != 0) { device_printf(sc->dev, "Err %d allocating rx_small dmat\n", err); return err; } err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 1, /* alignment */ #if MXGE_VIRT_JUMBOS 4096, /* boundary */ #else 0, /* boundary */ #endif BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ 3*4096, /* maxsize */ #if MXGE_VIRT_JUMBOS 3, /* num segs */ 4096, /* maxsegsize*/ #else 1, /* num segs */ MJUM9BYTES, /* maxsegsize*/ #endif BUS_DMA_ALLOCNOW, /* flags */ NULL, NULL, /* lock */ &ss->rx_big.dmat); /* tag */ if (err != 0) { device_printf(sc->dev, "Err %d allocating rx_big dmat\n", err); return err; } for (i = 0; i <= ss->rx_small.mask; i++) { err = bus_dmamap_create(ss->rx_small.dmat, 0, &ss->rx_small.info[i].map); if (err != 0) { device_printf(sc->dev, "Err %d rx_small dmamap\n", err); return err; } } err = bus_dmamap_create(ss->rx_small.dmat, 0, &ss->rx_small.extra_map); if (err != 0) { device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err); return err; } for (i = 0; i <= ss->rx_big.mask; i++) { err = bus_dmamap_create(ss->rx_big.dmat, 0, &ss->rx_big.info[i].map); if (err != 0) { device_printf(sc->dev, "Err %d rx_big dmamap\n", err); return err; } } err = bus_dmamap_create(ss->rx_big.dmat, 0, &ss->rx_big.extra_map); if (err != 0) { device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err); return err; } /* now allocate TX resources */ #ifndef IFNET_BUF_RING /* only use a single TX ring for now */ if (ss != ss->sc->ss) return 0; #endif ss->tx.mask = tx_ring_entries - 1; ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4); /* allocate the tx request copy block */ bytes = 8 + sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4); ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK); /* ensure req_list entries are aligned to 8 bytes */ ss->tx.req_list = (mcp_kreq_ether_send_t *) ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL); /* allocate the tx busdma segment list */ bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc; ss->tx.seg_list = (bus_dma_segment_t *) malloc(bytes, M_DEVBUF, M_WAITOK); /* allocate the tx host info ring */ bytes = tx_ring_entries * sizeof (*ss->tx.info); ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); /* allocate the tx busdma resources */ err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 1, /* alignment */ sc->tx_boundary, /* boundary */ BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ 65536 + 256, /* maxsize */ ss->tx.max_desc - 2, /* num segs */ sc->tx_boundary, /* maxsegsz */ BUS_DMA_ALLOCNOW, /* flags */ NULL, NULL, /* lock */ &ss->tx.dmat); /* tag */ if (err != 0) { device_printf(sc->dev, "Err %d allocating tx dmat\n", err); return err; } /* now use these tags to setup dmamaps for each slot in the ring */ for (i = 0; i <= ss->tx.mask; i++) { err = bus_dmamap_create(ss->tx.dmat, 0, &ss->tx.info[i].map); if (err != 0) { device_printf(sc->dev, "Err %d tx dmamap\n", err); return err; } } return 0; } static int mxge_alloc_rings(mxge_softc_t *sc) { mxge_cmd_t cmd; int tx_ring_size; int tx_ring_entries, rx_ring_entries; int err, slice; /* get ring sizes */ err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd); tx_ring_size = cmd.data0; if (err != 0) { device_printf(sc->dev, "Cannot determine tx ring sizes\n"); goto abort; } tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t); rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t); IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1); sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen; IFQ_SET_READY(&sc->ifp->if_snd); for (slice = 0; slice < sc->num_slices; slice++) { err = mxge_alloc_slice_rings(&sc->ss[slice], rx_ring_entries, tx_ring_entries); if (err != 0) goto abort; } return 0; abort: mxge_free_rings(sc); return err; } static void mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs) { int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD; if (bufsize < MCLBYTES) { /* easy, everything fits in a single buffer */ *big_buf_size = MCLBYTES; *cl_size = MCLBYTES; *nbufs = 1; return; } if (bufsize < MJUMPAGESIZE) { /* still easy, everything still fits in a single buffer */ *big_buf_size = MJUMPAGESIZE; *cl_size = MJUMPAGESIZE; *nbufs = 1; return; } #if MXGE_VIRT_JUMBOS /* now we need to use virtually contiguous buffers */ *cl_size = MJUM9BYTES; *big_buf_size = 4096; *nbufs = mtu / 4096 + 1; /* needs to be a power of two, so round up */ if (*nbufs == 3) *nbufs = 4; #else *cl_size = MJUM9BYTES; *big_buf_size = MJUM9BYTES; *nbufs = 1; #endif } static int mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size) { mxge_softc_t *sc; mxge_cmd_t cmd; bus_dmamap_t map; int err, i, slice; sc = ss->sc; slice = ss - sc->ss; #if defined(INET) || defined(INET6) (void)tcp_lro_init(&ss->lc); #endif ss->lc.ifp = sc->ifp; /* get the lanai pointers to the send and receive rings */ err = 0; #ifndef IFNET_BUF_RING /* We currently only send from the first slice */ if (slice == 0) { #endif cmd.data0 = slice; err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd); ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0); ss->tx.send_go = (volatile uint32_t *) (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice); ss->tx.send_stop = (volatile uint32_t *) (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice); #ifndef IFNET_BUF_RING } #endif cmd.data0 = slice; err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd); ss->rx_small.lanai = (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0); cmd.data0 = slice; err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd); ss->rx_big.lanai = (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0); if (err != 0) { device_printf(sc->dev, "failed to get ring sizes or locations\n"); return EIO; } /* stock receive rings */ for (i = 0; i <= ss->rx_small.mask; i++) { map = ss->rx_small.info[i].map; err = mxge_get_buf_small(ss, map, i); if (err) { device_printf(sc->dev, "alloced %d/%d smalls\n", i, ss->rx_small.mask + 1); return ENOMEM; } } for (i = 0; i <= ss->rx_big.mask; i++) { ss->rx_big.shadow[i].addr_low = 0xffffffff; ss->rx_big.shadow[i].addr_high = 0xffffffff; } ss->rx_big.nbufs = nbufs; ss->rx_big.cl_size = cl_size; ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD; for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) { map = ss->rx_big.info[i].map; err = mxge_get_buf_big(ss, map, i); if (err) { device_printf(sc->dev, "alloced %d/%d bigs\n", i, ss->rx_big.mask + 1); return ENOMEM; } } return 0; } static int mxge_open(mxge_softc_t *sc) { mxge_cmd_t cmd; int err, big_bytes, nbufs, slice, cl_size, i; bus_addr_t bus; volatile uint8_t *itable; struct mxge_slice_state *ss; /* Copy the MAC address in case it was overridden */ bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN); err = mxge_reset(sc, 1); if (err != 0) { device_printf(sc->dev, "failed to reset\n"); return EIO; } if (sc->num_slices > 1) { /* setup the indirection table */ cmd.data0 = sc->num_slices; err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd); err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd); if (err != 0) { device_printf(sc->dev, "failed to setup rss tables\n"); return err; } /* just enable an identity mapping */ itable = sc->sram + cmd.data0; for (i = 0; i < sc->num_slices; i++) itable[i] = (uint8_t)i; cmd.data0 = 1; cmd.data1 = mxge_rss_hash_type; err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd); if (err != 0) { device_printf(sc->dev, "failed to enable slices\n"); return err; } } mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs); cmd.data0 = nbufs; err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd); /* error is only meaningful if we're trying to set MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */ if (err && nbufs > 1) { device_printf(sc->dev, "Failed to set alway-use-n to %d\n", nbufs); return EIO; } /* Give the firmware the mtu and the big and small buffer sizes. The firmware wants the big buf size to be a power of two. Luckily, FreeBSD's clusters are powers of two */ cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd); cmd.data0 = MHLEN - MXGEFW_PAD; err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd); cmd.data0 = big_bytes; err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd); if (err != 0) { device_printf(sc->dev, "failed to setup params\n"); goto abort; } /* Now give him the pointer to the stats block */ for (slice = 0; #ifdef IFNET_BUF_RING slice < sc->num_slices; #else slice < 1; #endif slice++) { ss = &sc->ss[slice]; cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr); cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr); cmd.data2 = sizeof(struct mcp_irq_data); cmd.data2 |= (slice << 16); err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd); } if (err != 0) { bus = sc->ss->fw_stats_dma.bus_addr; bus += offsetof(struct mcp_irq_data, send_done_count); cmd.data0 = MXGE_LOWPART_TO_U32(bus); cmd.data1 = MXGE_HIGHPART_TO_U32(bus); err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd); /* Firmware cannot support multicast without STATS_DMA_V2 */ sc->fw_multicast_support = 0; } else { sc->fw_multicast_support = 1; } if (err != 0) { device_printf(sc->dev, "failed to setup params\n"); goto abort; } for (slice = 0; slice < sc->num_slices; slice++) { err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size); if (err != 0) { device_printf(sc->dev, "couldn't open slice %d\n", slice); goto abort; } } /* Finally, start the firmware running */ err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd); if (err) { device_printf(sc->dev, "Couldn't bring up link\n"); goto abort; } #ifdef IFNET_BUF_RING for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; ss->if_drv_flags |= IFF_DRV_RUNNING; ss->if_drv_flags &= ~IFF_DRV_OACTIVE; } #endif sc->ifp->if_drv_flags |= IFF_DRV_RUNNING; sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; abort: mxge_free_mbufs(sc); return err; } static int mxge_close(mxge_softc_t *sc, int down) { mxge_cmd_t cmd; int err, old_down_cnt; #ifdef IFNET_BUF_RING struct mxge_slice_state *ss; int slice; #endif #ifdef IFNET_BUF_RING for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; ss->if_drv_flags &= ~IFF_DRV_RUNNING; } #endif sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; if (!down) { old_down_cnt = sc->down_cnt; wmb(); err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd); if (err) { device_printf(sc->dev, "Couldn't bring down link\n"); } if (old_down_cnt == sc->down_cnt) { /* wait for down irq */ DELAY(10 * sc->intr_coal_delay); } wmb(); if (old_down_cnt == sc->down_cnt) { device_printf(sc->dev, "never got down irq\n"); } } mxge_free_mbufs(sc); return 0; } static void mxge_setup_cfg_space(mxge_softc_t *sc) { device_t dev = sc->dev; int reg; uint16_t lnk, pectl; /* find the PCIe link width and set max read request to 4KB*/ if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) { lnk = pci_read_config(dev, reg + 0x12, 2); sc->link_width = (lnk >> 4) & 0x3f; if (sc->pectl == 0) { pectl = pci_read_config(dev, reg + 0x8, 2); pectl = (pectl & ~0x7000) | (5 << 12); pci_write_config(dev, reg + 0x8, pectl, 2); sc->pectl = pectl; } else { /* restore saved pectl after watchdog reset */ pci_write_config(dev, reg + 0x8, sc->pectl, 2); } } /* Enable DMA and Memory space access */ pci_enable_busmaster(dev); } static uint32_t mxge_read_reboot(mxge_softc_t *sc) { device_t dev = sc->dev; uint32_t vs; /* find the vendor specific offset */ if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) { device_printf(sc->dev, "could not find vendor specific offset\n"); return (uint32_t)-1; } /* enable read32 mode */ pci_write_config(dev, vs + 0x10, 0x3, 1); /* tell NIC which register to read */ pci_write_config(dev, vs + 0x18, 0xfffffff0, 4); return (pci_read_config(dev, vs + 0x14, 4)); } static void mxge_watchdog_reset(mxge_softc_t *sc) { struct pci_devinfo *dinfo; struct mxge_slice_state *ss; int err, running, s, num_tx_slices = 1; uint32_t reboot; uint16_t cmd; err = ENXIO; device_printf(sc->dev, "Watchdog reset!\n"); /* * check to see if the NIC rebooted. If it did, then all of * PCI config space has been reset, and things like the * busmaster bit will be zero. If this is the case, then we * must restore PCI config space before the NIC can be used * again */ cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2); if (cmd == 0xffff) { /* * maybe the watchdog caught the NIC rebooting; wait * up to 100ms for it to finish. If it does not come * back, then give up */ DELAY(1000*100); cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2); if (cmd == 0xffff) { device_printf(sc->dev, "NIC disappeared!\n"); } } if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) { /* print the reboot status */ reboot = mxge_read_reboot(sc); device_printf(sc->dev, "NIC rebooted, status = 0x%x\n", reboot); running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING; if (running) { /* * quiesce NIC so that TX routines will not try to * xmit after restoration of BAR */ /* Mark the link as down */ if (sc->link_state) { sc->link_state = 0; if_link_state_change(sc->ifp, LINK_STATE_DOWN); } #ifdef IFNET_BUF_RING num_tx_slices = sc->num_slices; #endif /* grab all TX locks to ensure no tx */ for (s = 0; s < num_tx_slices; s++) { ss = &sc->ss[s]; mtx_lock(&ss->tx.mtx); } mxge_close(sc, 1); } /* restore PCI configuration space */ dinfo = device_get_ivars(sc->dev); pci_cfg_restore(sc->dev, dinfo); /* and redo any changes we made to our config space */ mxge_setup_cfg_space(sc); /* reload f/w */ err = mxge_load_firmware(sc, 0); if (err) { device_printf(sc->dev, "Unable to re-load f/w\n"); } if (running) { if (!err) err = mxge_open(sc); /* release all TX locks */ for (s = 0; s < num_tx_slices; s++) { ss = &sc->ss[s]; #ifdef IFNET_BUF_RING mxge_start_locked(ss); #endif mtx_unlock(&ss->tx.mtx); } } sc->watchdog_resets++; } else { device_printf(sc->dev, "NIC did not reboot, not resetting\n"); err = 0; } if (err) { device_printf(sc->dev, "watchdog reset failed\n"); } else { if (sc->dying == 2) sc->dying = 0; callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc); } } static void mxge_watchdog_task(void *arg, int pending) { mxge_softc_t *sc = arg; mtx_lock(&sc->driver_mtx); mxge_watchdog_reset(sc); mtx_unlock(&sc->driver_mtx); } static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice) { tx = &sc->ss[slice].tx; device_printf(sc->dev, "slice %d struck? ring state:\n", slice); device_printf(sc->dev, "tx.req=%d tx.done=%d, tx.queue_active=%d\n", tx->req, tx->done, tx->queue_active); device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n", tx->activate, tx->deactivate); device_printf(sc->dev, "pkt_done=%d fw=%d\n", tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count)); } static int mxge_watchdog(mxge_softc_t *sc) { mxge_tx_ring_t *tx; uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause); int i, err = 0; /* see if we have outstanding transmits, which have been pending for more than mxge_ticks */ for (i = 0; #ifdef IFNET_BUF_RING (i < sc->num_slices) && (err == 0); #else (i < 1) && (err == 0); #endif i++) { tx = &sc->ss[i].tx; if (tx->req != tx->done && tx->watchdog_req != tx->watchdog_done && tx->done == tx->watchdog_done) { /* check for pause blocking before resetting */ if (tx->watchdog_rx_pause == rx_pause) { mxge_warn_stuck(sc, tx, i); taskqueue_enqueue(sc->tq, &sc->watchdog_task); return (ENXIO); } else device_printf(sc->dev, "Flow control blocking " "xmits, check link partner\n"); } tx->watchdog_req = tx->req; tx->watchdog_done = tx->done; tx->watchdog_rx_pause = rx_pause; } if (sc->need_media_probe) mxge_media_probe(sc); return (err); } static uint64_t mxge_get_counter(struct ifnet *ifp, ift_counter cnt) { struct mxge_softc *sc; uint64_t rv; sc = if_getsoftc(ifp); rv = 0; switch (cnt) { case IFCOUNTER_IPACKETS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].ipackets; return (rv); case IFCOUNTER_OPACKETS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].opackets; return (rv); case IFCOUNTER_OERRORS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].oerrors; return (rv); #ifdef IFNET_BUF_RING case IFCOUNTER_OBYTES: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].obytes; return (rv); case IFCOUNTER_OMCASTS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].omcasts; return (rv); case IFCOUNTER_OQDROPS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].tx.br->br_drops; return (rv); #endif default: return (if_get_counter_default(ifp, cnt)); } } static void mxge_tick(void *arg) { mxge_softc_t *sc = arg; u_long pkts = 0; int err = 0; int running, ticks; uint16_t cmd; ticks = mxge_ticks; running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING; if (running) { if (!sc->watchdog_countdown) { err = mxge_watchdog(sc); sc->watchdog_countdown = 4; } sc->watchdog_countdown--; } if (pkts == 0) { /* ensure NIC did not suffer h/w fault while idle */ cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2); if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) { sc->dying = 2; taskqueue_enqueue(sc->tq, &sc->watchdog_task); err = ENXIO; } /* look less often if NIC is idle */ ticks *= 4; } if (err == 0) callout_reset(&sc->co_hdl, ticks, mxge_tick, sc); } static int mxge_media_change(struct ifnet *ifp) { return EINVAL; } static int mxge_change_mtu(mxge_softc_t *sc, int mtu) { struct ifnet *ifp = sc->ifp; int real_mtu, old_mtu; int err = 0; real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; if ((real_mtu > sc->max_mtu) || real_mtu < 60) return EINVAL; mtx_lock(&sc->driver_mtx); old_mtu = ifp->if_mtu; ifp->if_mtu = mtu; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { mxge_close(sc, 0); err = mxge_open(sc); if (err != 0) { ifp->if_mtu = old_mtu; mxge_close(sc, 0); (void) mxge_open(sc); } } mtx_unlock(&sc->driver_mtx); return err; } static void mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { mxge_softc_t *sc = ifp->if_softc; if (sc == NULL) return; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER | IFM_FDX; ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0; ifmr->ifm_active |= sc->current_media; } static int mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { mxge_softc_t *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int err, mask; err = 0; switch (command) { case SIOCSIFADDR: case SIOCGIFADDR: err = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: err = mxge_change_mtu(sc, ifr->ifr_mtu); break; case SIOCSIFFLAGS: mtx_lock(&sc->driver_mtx); if (sc->dying) { mtx_unlock(&sc->driver_mtx); return EINVAL; } if (ifp->if_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { err = mxge_open(sc); } else { /* take care of promis can allmulti flag chages */ mxge_change_promisc(sc, ifp->if_flags & IFF_PROMISC); mxge_set_multicast_list(sc); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { mxge_close(sc, 0); } } mtx_unlock(&sc->driver_mtx); break; case SIOCADDMULTI: case SIOCDELMULTI: mtx_lock(&sc->driver_mtx); mxge_set_multicast_list(sc); mtx_unlock(&sc->driver_mtx); break; case SIOCSIFCAP: mtx_lock(&sc->driver_mtx); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP); } else { ifp->if_capenable |= IFCAP_TXCSUM; ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); } } else if (mask & IFCAP_RXCSUM) { if (IFCAP_RXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_RXCSUM; } else { ifp->if_capenable |= IFCAP_RXCSUM; } } if (mask & IFCAP_TSO4) { if (IFCAP_TSO4 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TSO4; } else if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_TSO; } else { printf("mxge requires tx checksum offload" " be enabled to use TSO\n"); err = EINVAL; } } #if IFCAP_TSO6 if (mask & IFCAP_TXCSUM_IPV6) { if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) { ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); ifp->if_hwassist &= ~(CSUM_TCP_IPV6 | CSUM_UDP); } else { ifp->if_capenable |= IFCAP_TXCSUM_IPV6; ifp->if_hwassist |= (CSUM_TCP_IPV6 | CSUM_UDP_IPV6); } } else if (mask & IFCAP_RXCSUM_IPV6) { if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6; } else { ifp->if_capenable |= IFCAP_RXCSUM_IPV6; } } if (mask & IFCAP_TSO6) { if (IFCAP_TSO6 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TSO6; } else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) { ifp->if_capenable |= IFCAP_TSO6; ifp->if_hwassist |= CSUM_TSO; } else { printf("mxge requires tx checksum offload" " be enabled to use TSO\n"); err = EINVAL; } } #endif /*IFCAP_TSO6 */ if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) || !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING)) ifp->if_capenable &= ~IFCAP_VLAN_HWTSO; mtx_unlock(&sc->driver_mtx); VLAN_CAPABILITIES(ifp); break; case SIOCGIFMEDIA: mtx_lock(&sc->driver_mtx); mxge_media_probe(sc); mtx_unlock(&sc->driver_mtx); err = ifmedia_ioctl(ifp, (struct ifreq *)data, &sc->media, command); break; default: err = ENOTTY; } return err; } static void mxge_fetch_tunables(mxge_softc_t *sc) { TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices); TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled", &mxge_flow_control); TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay); TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable); TUNABLE_INT_FETCH("hw.mxge.force_firmware", &mxge_force_firmware); TUNABLE_INT_FETCH("hw.mxge.deassert_wait", &mxge_deassert_wait); TUNABLE_INT_FETCH("hw.mxge.verbose", &mxge_verbose); TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks); TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc); TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type); TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type); TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu); TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle); if (bootverbose) mxge_verbose = 1; if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000) mxge_intr_coal_delay = 30; if (mxge_ticks == 0) mxge_ticks = hz / 2; sc->pause = mxge_flow_control; if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) { mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT; } if (mxge_initial_mtu > ETHERMTU_JUMBO || mxge_initial_mtu < ETHER_MIN_LEN) mxge_initial_mtu = ETHERMTU_JUMBO; if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE) mxge_throttle = MXGE_MAX_THROTTLE; if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE) mxge_throttle = MXGE_MIN_THROTTLE; sc->throttle = mxge_throttle; } static void mxge_free_slices(mxge_softc_t *sc) { struct mxge_slice_state *ss; int i; if (sc->ss == NULL) return; for (i = 0; i < sc->num_slices; i++) { ss = &sc->ss[i]; if (ss->fw_stats != NULL) { mxge_dma_free(&ss->fw_stats_dma); ss->fw_stats = NULL; #ifdef IFNET_BUF_RING if (ss->tx.br != NULL) { drbr_free(ss->tx.br, M_DEVBUF); ss->tx.br = NULL; } #endif mtx_destroy(&ss->tx.mtx); } if (ss->rx_done.entry != NULL) { mxge_dma_free(&ss->rx_done.dma); ss->rx_done.entry = NULL; } } free(sc->ss, M_DEVBUF); sc->ss = NULL; } static int mxge_alloc_slices(mxge_softc_t *sc) { mxge_cmd_t cmd; struct mxge_slice_state *ss; size_t bytes; int err, i, max_intr_slots; err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd); if (err != 0) { device_printf(sc->dev, "Cannot determine rx ring size\n"); return err; } sc->rx_ring_size = cmd.data0; max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t)); bytes = sizeof (*sc->ss) * sc->num_slices; sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->ss == NULL) return (ENOMEM); for (i = 0; i < sc->num_slices; i++) { ss = &sc->ss[i]; ss->sc = sc; /* allocate per-slice rx interrupt queues */ bytes = max_intr_slots * sizeof (*ss->rx_done.entry); err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096); if (err != 0) goto abort; ss->rx_done.entry = ss->rx_done.dma.addr; bzero(ss->rx_done.entry, bytes); /* * allocate the per-slice firmware stats; stats * (including tx) are used used only on the first * slice for now */ #ifndef IFNET_BUF_RING if (i > 0) continue; #endif bytes = sizeof (*ss->fw_stats); err = mxge_dma_alloc(sc, &ss->fw_stats_dma, sizeof (*ss->fw_stats), 64); if (err != 0) goto abort; ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr; snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name), "%s:tx(%d)", device_get_nameunit(sc->dev), i); mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF); #ifdef IFNET_BUF_RING ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK, &ss->tx.mtx); #endif } return (0); abort: mxge_free_slices(sc); return (ENOMEM); } static void mxge_slice_probe(mxge_softc_t *sc) { mxge_cmd_t cmd; char *old_fw; int msix_cnt, status, max_intr_slots; sc->num_slices = 1; /* * don't enable multiple slices if they are not enabled, * or if this is not an SMP system */ if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2) return; /* see how many MSI-X interrupts are available */ msix_cnt = pci_msix_count(sc->dev); if (msix_cnt < 2) return; /* now load the slice aware firmware see what it supports */ old_fw = sc->fw_name; if (old_fw == mxge_fw_aligned) sc->fw_name = mxge_fw_rss_aligned; else sc->fw_name = mxge_fw_rss_unaligned; status = mxge_load_firmware(sc, 0); if (status != 0) { device_printf(sc->dev, "Falling back to a single slice\n"); return; } /* try to send a reset command to the card to see if it is alive */ memset(&cmd, 0, sizeof (cmd)); status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd); if (status != 0) { device_printf(sc->dev, "failed reset\n"); goto abort_with_fw; } /* get rx ring size */ status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd); if (status != 0) { device_printf(sc->dev, "Cannot determine rx ring size\n"); goto abort_with_fw; } max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t)); /* tell it the size of the interrupt queues */ cmd.data0 = max_intr_slots * sizeof (struct mcp_slot); status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd); if (status != 0) { device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n"); goto abort_with_fw; } /* ask the maximum number of slices it supports */ status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd); if (status != 0) { device_printf(sc->dev, "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n"); goto abort_with_fw; } sc->num_slices = cmd.data0; if (sc->num_slices > msix_cnt) sc->num_slices = msix_cnt; if (mxge_max_slices == -1) { /* cap to number of CPUs in system */ if (sc->num_slices > mp_ncpus) sc->num_slices = mp_ncpus; } else { if (sc->num_slices > mxge_max_slices) sc->num_slices = mxge_max_slices; } /* make sure it is a power of two */ while (sc->num_slices & (sc->num_slices - 1)) sc->num_slices--; if (mxge_verbose) device_printf(sc->dev, "using %d slices\n", sc->num_slices); return; abort_with_fw: sc->fw_name = old_fw; (void) mxge_load_firmware(sc, 0); } static int mxge_add_msix_irqs(mxge_softc_t *sc) { size_t bytes; int count, err, i, rid; rid = PCIR_BAR(2); sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->msix_table_res == NULL) { device_printf(sc->dev, "couldn't alloc MSIX table res\n"); return ENXIO; } count = sc->num_slices; err = pci_alloc_msix(sc->dev, &count); if (err != 0) { device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d" "err = %d \n", sc->num_slices, err); goto abort_with_msix_table; } if (count < sc->num_slices) { device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n", count, sc->num_slices); device_printf(sc->dev, "Try setting hw.mxge.max_slices to %d\n", count); err = ENOSPC; goto abort_with_msix; } bytes = sizeof (*sc->msix_irq_res) * sc->num_slices; sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO); if (sc->msix_irq_res == NULL) { err = ENOMEM; goto abort_with_msix; } for (i = 0; i < sc->num_slices; i++) { rid = i + 1; sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (sc->msix_irq_res[i] == NULL) { device_printf(sc->dev, "couldn't allocate IRQ res" " for message %d\n", i); err = ENXIO; goto abort_with_res; } } bytes = sizeof (*sc->msix_ih) * sc->num_slices; sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO); for (i = 0; i < sc->num_slices; i++) { err = bus_setup_intr(sc->dev, sc->msix_irq_res[i], INTR_TYPE_NET | INTR_MPSAFE, #if __FreeBSD_version > 700030 NULL, #endif mxge_intr, &sc->ss[i], &sc->msix_ih[i]); if (err != 0) { device_printf(sc->dev, "couldn't setup intr for " "message %d\n", i); goto abort_with_intr; } bus_describe_intr(sc->dev, sc->msix_irq_res[i], sc->msix_ih[i], "s%d", i); } if (mxge_verbose) { device_printf(sc->dev, "using %d msix IRQs:", sc->num_slices); for (i = 0; i < sc->num_slices; i++) printf(" %ld", rman_get_start(sc->msix_irq_res[i])); printf("\n"); } return (0); abort_with_intr: for (i = 0; i < sc->num_slices; i++) { if (sc->msix_ih[i] != NULL) { bus_teardown_intr(sc->dev, sc->msix_irq_res[i], sc->msix_ih[i]); sc->msix_ih[i] = NULL; } } free(sc->msix_ih, M_DEVBUF); abort_with_res: for (i = 0; i < sc->num_slices; i++) { rid = i + 1; if (sc->msix_irq_res[i] != NULL) bus_release_resource(sc->dev, SYS_RES_IRQ, rid, sc->msix_irq_res[i]); sc->msix_irq_res[i] = NULL; } free(sc->msix_irq_res, M_DEVBUF); abort_with_msix: pci_release_msi(sc->dev); abort_with_msix_table: bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2), sc->msix_table_res); return err; } static int mxge_add_single_irq(mxge_softc_t *sc) { int count, err, rid; count = pci_msi_count(sc->dev); if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) { rid = 1; } else { rid = 0; sc->legacy_irq = 1; } sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0, 1, RF_SHAREABLE | RF_ACTIVE); if (sc->irq_res == NULL) { device_printf(sc->dev, "could not alloc interrupt\n"); return ENXIO; } if (mxge_verbose) device_printf(sc->dev, "using %s irq %ld\n", sc->legacy_irq ? "INTx" : "MSI", rman_get_start(sc->irq_res)); err = bus_setup_intr(sc->dev, sc->irq_res, INTR_TYPE_NET | INTR_MPSAFE, #if __FreeBSD_version > 700030 NULL, #endif mxge_intr, &sc->ss[0], &sc->ih); if (err != 0) { bus_release_resource(sc->dev, SYS_RES_IRQ, sc->legacy_irq ? 0 : 1, sc->irq_res); if (!sc->legacy_irq) pci_release_msi(sc->dev); } return err; } static void mxge_rem_msix_irqs(mxge_softc_t *sc) { int i, rid; for (i = 0; i < sc->num_slices; i++) { if (sc->msix_ih[i] != NULL) { bus_teardown_intr(sc->dev, sc->msix_irq_res[i], sc->msix_ih[i]); sc->msix_ih[i] = NULL; } } free(sc->msix_ih, M_DEVBUF); for (i = 0; i < sc->num_slices; i++) { rid = i + 1; if (sc->msix_irq_res[i] != NULL) bus_release_resource(sc->dev, SYS_RES_IRQ, rid, sc->msix_irq_res[i]); sc->msix_irq_res[i] = NULL; } free(sc->msix_irq_res, M_DEVBUF); bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2), sc->msix_table_res); pci_release_msi(sc->dev); return; } static void mxge_rem_single_irq(mxge_softc_t *sc) { bus_teardown_intr(sc->dev, sc->irq_res, sc->ih); bus_release_resource(sc->dev, SYS_RES_IRQ, sc->legacy_irq ? 0 : 1, sc->irq_res); if (!sc->legacy_irq) pci_release_msi(sc->dev); } static void mxge_rem_irq(mxge_softc_t *sc) { if (sc->num_slices > 1) mxge_rem_msix_irqs(sc); else mxge_rem_single_irq(sc); } static int mxge_add_irq(mxge_softc_t *sc) { int err; if (sc->num_slices > 1) err = mxge_add_msix_irqs(sc); else err = mxge_add_single_irq(sc); if (0 && err == 0 && sc->num_slices > 1) { mxge_rem_msix_irqs(sc); err = mxge_add_msix_irqs(sc); } return err; } static int mxge_attach(device_t dev) { mxge_cmd_t cmd; mxge_softc_t *sc = device_get_softc(dev); struct ifnet *ifp; int err, rid; sc->dev = dev; mxge_fetch_tunables(sc); TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc); sc->tq = taskqueue_create("mxge_taskq", M_WAITOK, taskqueue_thread_enqueue, &sc->tq); if (sc->tq == NULL) { err = ENOMEM; goto abort_with_nothing; } err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, /* alignment */ 0, /* boundary */ BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ 65536 + 256, /* maxsize */ MXGE_MAX_SEND_DESC, /* num segs */ 65536, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lock */ &sc->parent_dmat); /* tag */ if (err != 0) { device_printf(sc->dev, "Err %d allocating parent dmat\n", err); goto abort_with_tq; } ifp = sc->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not if_alloc()\n"); err = ENOSPC; goto abort_with_parent_dmat; } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd", device_get_nameunit(dev)); mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF); snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name), "%s:drv", device_get_nameunit(dev)); mtx_init(&sc->driver_mtx, sc->driver_mtx_name, MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0); mxge_setup_cfg_space(sc); /* Map the board into the kernel */ rid = PCIR_BARS; sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0, ~0, 1, RF_ACTIVE); if (sc->mem_res == NULL) { device_printf(dev, "could not map memory\n"); err = ENXIO; goto abort_with_lock; } sc->sram = rman_get_virtual(sc->mem_res); sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100; if (sc->sram_size > rman_get_size(sc->mem_res)) { device_printf(dev, "impossible memory region size %ld\n", rman_get_size(sc->mem_res)); err = ENXIO; goto abort_with_mem_res; } /* make NULL terminated copy of the EEPROM strings section of lanai SRAM */ bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE); bus_space_read_region_1(rman_get_bustag(sc->mem_res), rman_get_bushandle(sc->mem_res), sc->sram_size - MXGE_EEPROM_STRINGS_SIZE, sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2); err = mxge_parse_strings(sc); if (err != 0) goto abort_with_mem_res; /* Enable write combining for efficient use of PCIe bus */ mxge_enable_wc(sc); /* Allocate the out of band dma memory */ err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof (mxge_cmd_t), 64); if (err != 0) goto abort_with_mem_res; sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr; err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64); if (err != 0) goto abort_with_cmd_dma; err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096); if (err != 0) goto abort_with_zeropad_dma; /* select & load the firmware */ err = mxge_select_firmware(sc); if (err != 0) goto abort_with_dmabench; sc->intr_coal_delay = mxge_intr_coal_delay; mxge_slice_probe(sc); err = mxge_alloc_slices(sc); if (err != 0) goto abort_with_dmabench; err = mxge_reset(sc, 0); if (err != 0) goto abort_with_slices; err = mxge_alloc_rings(sc); if (err != 0) { device_printf(sc->dev, "failed to allocate rings\n"); goto abort_with_slices; } err = mxge_add_irq(sc); if (err != 0) { device_printf(sc->dev, "failed to add irq\n"); goto abort_with_rings; } ifp->if_baudrate = IF_Gbps(10); ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 | IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6; #if defined(INET) || defined(INET6) ifp->if_capabilities |= IFCAP_LRO; #endif #ifdef MXGE_NEW_VLAN_API ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; /* Only FW 1.4.32 and newer can do TSO over vlans */ if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 && sc->fw_ver_tiny >= 32) ifp->if_capabilities |= IFCAP_VLAN_HWTSO; #endif sc->max_mtu = mxge_max_mtu(sc); if (sc->max_mtu >= 9000) ifp->if_capabilities |= IFCAP_JUMBO_MTU; else device_printf(dev, "MTU limited to %d. Install " "latest firmware for 9000 byte jumbo support\n", sc->max_mtu - ETHER_HDR_LEN); ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO; ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6; /* check to see if f/w supports TSO for IPv6 */ if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) { if (CSUM_TCP_IPV6) ifp->if_capabilities |= IFCAP_TSO6; sc->max_tso6_hlen = min(cmd.data0, sizeof (sc->ss[0].scratch)); } ifp->if_capenable = ifp->if_capabilities; if (sc->lro_cnt == 0) ifp->if_capenable &= ~IFCAP_LRO; ifp->if_init = mxge_init; ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = mxge_ioctl; ifp->if_start = mxge_start; ifp->if_get_counter = mxge_get_counter; /* Initialise the ifmedia structure */ ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status); mxge_media_init(sc); mxge_media_probe(sc); sc->dying = 0; ether_ifattach(ifp, sc->mac_addr); /* ether_ifattach sets mtu to ETHERMTU */ if (mxge_initial_mtu != ETHERMTU) mxge_change_mtu(sc, mxge_initial_mtu); mxge_add_sysctls(sc); #ifdef IFNET_BUF_RING ifp->if_transmit = mxge_transmit; ifp->if_qflush = mxge_qflush; #endif taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq", device_get_nameunit(sc->dev)); callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc); return 0; abort_with_rings: mxge_free_rings(sc); abort_with_slices: mxge_free_slices(sc); abort_with_dmabench: mxge_dma_free(&sc->dmabench_dma); abort_with_zeropad_dma: mxge_dma_free(&sc->zeropad_dma); abort_with_cmd_dma: mxge_dma_free(&sc->cmd_dma); abort_with_mem_res: bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res); abort_with_lock: pci_disable_busmaster(dev); mtx_destroy(&sc->cmd_mtx); mtx_destroy(&sc->driver_mtx); if_free(ifp); abort_with_parent_dmat: bus_dma_tag_destroy(sc->parent_dmat); abort_with_tq: if (sc->tq != NULL) { taskqueue_drain(sc->tq, &sc->watchdog_task); taskqueue_free(sc->tq); sc->tq = NULL; } abort_with_nothing: return err; } static int mxge_detach(device_t dev) { mxge_softc_t *sc = device_get_softc(dev); if (mxge_vlans_active(sc)) { device_printf(sc->dev, "Detach vlans before removing module\n"); return EBUSY; } mtx_lock(&sc->driver_mtx); sc->dying = 1; if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) mxge_close(sc, 0); mtx_unlock(&sc->driver_mtx); ether_ifdetach(sc->ifp); if (sc->tq != NULL) { taskqueue_drain(sc->tq, &sc->watchdog_task); taskqueue_free(sc->tq); sc->tq = NULL; } callout_drain(&sc->co_hdl); ifmedia_removeall(&sc->media); mxge_dummy_rdma(sc, 0); mxge_rem_sysctls(sc); mxge_rem_irq(sc); mxge_free_rings(sc); mxge_free_slices(sc); mxge_dma_free(&sc->dmabench_dma); mxge_dma_free(&sc->zeropad_dma); mxge_dma_free(&sc->cmd_dma); bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res); pci_disable_busmaster(dev); mtx_destroy(&sc->cmd_mtx); mtx_destroy(&sc->driver_mtx); if_free(sc->ifp); bus_dma_tag_destroy(sc->parent_dmat); return 0; } static int mxge_shutdown(device_t dev) { return 0; } /* This file uses Myri10GE driver indentation. Local Variables: c-file-style:"linux" tab-width:8 End: */ Index: head/sys/dev/netmap/netmap_freebsd.c =================================================================== --- head/sys/dev/netmap/netmap_freebsd.c (revision 275357) +++ head/sys/dev/netmap/netmap_freebsd.c (revision 275358) @@ -1,832 +1,832 @@ /* * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* $FreeBSD$ */ #include #include #include #include /* defines used in kernel.h */ #include /* POLLIN, POLLOUT */ #include /* types used in module initialization */ #include /* DEV_MODULE */ #include #include #include /* vtophys */ #include /* vtophys */ #include #include #include #include #include #include #include /* sockaddrs */ #include #include #include #include /* IFT_ETHER */ #include /* ether_ifdetach */ #include /* LLADDR */ #include /* bus_dmamap_* */ #include /* in6_cksum_pseudo() */ #include /* in_pseudo(), in_cksum_hdr() */ #include #include #include /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; int nw = len / 2; int i; for (i = 0; i < nw; i++) cur_sum += be16toh(words[i]); if (len & 1) cur_sum += (data[len-1] << 8); return cur_sum; } /* Fold a raw checksum: 'cur_sum' is in host byte order, while the * return value is in network byte order. */ uint16_t nm_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ while (cur_sum >> 16) cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); return htobe16((~cur_sum) & 0xFFFF); } uint16_t nm_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET uint16_t pseudolen = datalen + iph->protocol; /* Compute and insert the pseudo-header cheksum. */ *check = in_pseudo(iph->saddr, iph->daddr, htobe16(pseudolen)); /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). */ *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; D("inet4 segmentation not supported"); } #endif } void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; D("inet6 segmentation not supported"); } #endif } /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore */ int netmap_catch_rx(struct netmap_adapter *na, int intercept) { struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; struct ifnet *ifp = na->ifp; if (intercept) { if (gna->save_if_input) { D("cannot intercept again"); return EINVAL; /* already set */ } gna->save_if_input = ifp->if_input; ifp->if_input = generic_rx_handler; } else { if (!gna->save_if_input){ D("cannot restore"); return EINVAL; /* not saved */ } ifp->if_input = gna->save_if_input; gna->save_if_input = NULL; } return 0; } /* * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. * Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. */ void netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = na->ifp; if (enable) { na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; } else { ifp->if_transmit = na->if_transmit; } } /* * Transmit routine used by generic_netmap_txsync(). Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. * * We should add a reference to the mbuf so the m_freem() at the end * of the transmission does not consume resources. * * On FreeBSD, and on multiqueue cards, we can force the queue using - * if ((m->m_flags & M_FLOWID) != 0) + * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) * i = m->m_pkthdr.flowid % adapter->num_queues; * else * i = curcpu % adapter->num_queues; * */ int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr) { int ret; /* * The mbuf should be a cluster from our special pool, * so we do not need to do an m_copyback but just copy * (and eventually, just reference the netmap buffer) */ if (GET_MBUF_REFCNT(m) != 1) { D("invalid refcnt %d for %p", GET_MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } // XXX the ext_size check is unnecessary if we link the netmap buf if (m->m_ext.ext_size < len) { RD(5, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } if (0) { /* XXX seems to have negligible benefits */ m->m_ext.ext_buf = m->m_data = addr; } else { bcopy(addr, m->m_data, len); } m->m_len = m->m_pkthdr.len = len; // inc refcount. All ours, we could skip the atomic atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1); - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ ret = NA(ifp)->if_transmit(ifp, m); return ret; } #if __FreeBSD_version >= 1100005 struct netmap_adapter * netmap_getna(if_t ifp) { return (NA((struct ifnet *)ifp)); } #endif /* __FreeBSD_version >= 1100005 */ /* * The following two functions are empty until we have a generic * way to extract the info from the ifp */ int generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) { D("called, in tx %d rx %d", *tx, *rx); return 0; } void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { D("called, in txq %d rxq %d", *txq, *rxq); *txq = netmap_generic_rings; *rxq = netmap_generic_rings; } void netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { ND("called"); mit->mit_pending = 0; mit->mit_ring_idx = idx; mit->mit_na = na; } void netmap_mitigation_start(struct nm_generic_mit *mit) { ND("called"); } void netmap_mitigation_restart(struct nm_generic_mit *mit) { ND("called"); } int netmap_mitigation_active(struct nm_generic_mit *mit) { ND("called"); return 0; } void netmap_mitigation_cleanup(struct nm_generic_mit *mit) { ND("called"); } static int nm_vi_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) { return EINVAL; } static void nm_vi_start(struct ifnet *ifp) { panic("nm_vi_start() must not be called"); } /* * Index manager of persistent virtual interfaces. * It is used to decide the lowest byte of the MAC address. * We use the same algorithm with management of bridge port index. */ #define NM_VI_MAX 255 static struct { uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */ uint8_t active; struct mtx lock; } nm_vi_indices; void nm_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) nm_vi_indices.index[i] = i; nm_vi_indices.active = 0; mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF); } /* return -1 if no index available */ static int nm_vi_get_index(void) { int ret; mtx_lock(&nm_vi_indices.lock); ret = nm_vi_indices.active == NM_VI_MAX ? -1 : nm_vi_indices.index[nm_vi_indices.active++]; mtx_unlock(&nm_vi_indices.lock); return ret; } static void nm_vi_free_index(uint8_t val) { int i, lim; mtx_lock(&nm_vi_indices.lock); lim = nm_vi_indices.active; for (i = 0; i < lim; i++) { if (nm_vi_indices.index[i] == val) { /* swap index[lim-1] and j */ int tmp = nm_vi_indices.index[lim-1]; nm_vi_indices.index[lim-1] = val; nm_vi_indices.index[i] = tmp; nm_vi_indices.active--; break; } } if (lim == nm_vi_indices.active) D("funny, index %u didn't found", val); mtx_unlock(&nm_vi_indices.lock); } #undef NM_VI_MAX /* * Implementation of a netmap-capable virtual interface that * registered to the system. * It is based on if_tap.c and ip_fw_log.c in FreeBSD 9. * * Note: Linux sets refcount to 0 on allocation of net_device, * then increments it on registration to the system. * FreeBSD sets refcount to 1 on if_alloc(), and does not * increment this refcount on if_attach(). */ int nm_vi_persist(const char *name, struct ifnet **ret) { struct ifnet *ifp; u_short macaddr_hi; uint32_t macaddr_mid; u_char eaddr[6]; int unit = nm_vi_get_index(); /* just to decide MAC address */ if (unit < 0) return EBUSY; /* * We use the same MAC address generation method with tap * except for the highest octet is 00:be instead of 00:bd */ macaddr_hi = htons(0x00be); /* XXX tap + 1 */ macaddr_mid = (uint32_t) ticks; bcopy(&macaddr_hi, eaddr, sizeof(short)); bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); eaddr[5] = (uint8_t)unit; ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { D("if_alloc failed"); return ENOMEM; } if_initname(ifp, name, IF_DUNIT_NONE); ifp->if_mtu = 65536; ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = (void *)nm_vi_dummy; ifp->if_ioctl = nm_vi_dummy; ifp->if_start = nm_vi_start; ifp->if_mtu = ETHERMTU; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; ether_ifattach(ifp, eaddr); *ret = ifp; return 0; } /* unregister from the system and drop the final refcount */ void nm_vi_detach(struct ifnet *ifp) { nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and * destructor. */ struct netmap_vm_handle_t { struct cdev *dev; struct netmap_priv_d *priv; }; static int netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct netmap_vm_handle_t *vmh = handle; if (netmap_verbose) D("handle %p size %jd prot %d foff %jd", handle, (intmax_t)size, prot, (intmax_t)foff); if (color) *color = 0; dev_ref(vmh->dev); return 0; } static void netmap_dev_pager_dtor(void *handle) { struct netmap_vm_handle_t *vmh = handle; struct cdev *dev = vmh->dev; struct netmap_priv_d *priv = vmh->priv; if (netmap_verbose) D("handle %p", handle); netmap_dtor(priv); free(vmh, M_DEVBUF); dev_rel(dev); } static int netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct netmap_vm_handle_t *vmh = object->handle; struct netmap_priv_d *priv = vmh->priv; vm_paddr_t paddr; vm_page_t page; vm_memattr_t memattr; vm_pindex_t pidx; ND("object %p offset %jd prot %d mres %p", object, (intmax_t)offset, prot, mres); memattr = object->memattr; pidx = OFF_TO_IDX(offset); paddr = netmap_mem_ofstophys(priv->np_mref, offset); if (paddr == 0) return VM_PAGER_FAIL; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake page, update it with * the new physical address. */ page = *mres; vm_page_updatefake(page, paddr, memattr); } else { /* * Replace the passed in reqpage page with our own fake page and * free up the all of the original pages. */ #ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK #define VM_OBJECT_WLOCK VM_OBJECT_LOCK #endif /* VM_OBJECT_WUNLOCK */ VM_OBJECT_WUNLOCK(object); page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); vm_page_lock(*mres); vm_page_free(*mres); vm_page_unlock(*mres); *mres = page; vm_page_insert(page, object, pidx); } page->valid = VM_PAGE_BITS_ALL; return (VM_PAGER_OK); } static struct cdev_pager_ops netmap_cdev_pager_ops = { .cdev_pg_ctor = netmap_dev_pager_ctor, .cdev_pg_dtor = netmap_dev_pager_dtor, .cdev_pg_fault = netmap_dev_pager_fault, }; static int netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, vm_size_t objsize, vm_object_t *objp, int prot) { int error; struct netmap_vm_handle_t *vmh; struct netmap_priv_d *priv; vm_object_t obj; if (netmap_verbose) D("cdev %p foff %jd size %jd objp %p prot %d", cdev, (intmax_t )*foff, (intmax_t )objsize, objp, prot); vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (vmh == NULL) return ENOMEM; vmh->dev = cdev; NMG_LOCK(); error = devfs_get_cdevpriv((void**)&priv); if (error) goto err_unlock; vmh->priv = priv; priv->np_refcount++; NMG_UNLOCK(); error = netmap_get_memory(priv); if (error) goto err_deref; obj = cdev_pager_allocate(vmh, OBJT_DEVICE, &netmap_cdev_pager_ops, objsize, prot, *foff, NULL); if (obj == NULL) { D("cdev_pager_allocate failed"); error = EINVAL; goto err_deref; } *objp = obj; return 0; err_deref: NMG_LOCK(); priv->np_refcount--; err_unlock: NMG_UNLOCK(); // err: free(vmh, M_DEVBUF); return error; } // XXX can we remove this ? static int netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { if (netmap_verbose) D("dev %p fflag 0x%x devtype %d td %p", dev, fflag, devtype, td); return 0; } static int netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct netmap_priv_d *priv; int error; (void)dev; (void)oflags; (void)devtype; (void)td; // XXX wait or nowait ? priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, M_NOWAIT | M_ZERO); if (priv == NULL) return ENOMEM; error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) return error; priv->np_refcount = 1; return 0; } /******************** kqueue support ****************/ /* * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. * We use a non-zero argument to distinguish the call from the one * in kevent_scan() which instead also needs to run netmap_poll(). * The knote uses a global mutex for the time being. We might * try to reuse the one in the si, but it is not allocated * permanently so it might be a bit tricky. * * The *kqfilter function registers one or another f_event * depending on read or write mode. * In the call to f_event() td_fpop is NULL so any child function * calling devfs_get_cdevpriv() would fail - and we need it in * netmap_poll(). As a workaround we store priv into kn->kn_hook * and pass it as first argument to netmap_poll(), which then * uses the failure to tell that we are called from f_event() * and do not need the selrecord(). */ void freebsd_selwakeup(struct nm_selinfo *si, int pri) { if (netmap_verbose) D("on knote %p", &si->si.si_note); selwakeuppri(&si->si, pri); /* use a non-zero hint to tell the notification from the * call done in kqueue_scan() which uses 0 */ KNOTE_UNLOCKED(&si->si.si_note, 0x100 /* notification */); } static void netmap_knrdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; struct selinfo *si = &priv->np_rxsi->si; D("remove selinfo %p", si); knlist_remove(&si->si_note, kn, 0); } static void netmap_knwdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; struct selinfo *si = &priv->np_txsi->si; D("remove selinfo %p", si); knlist_remove(&si->si_note, kn, 0); } /* * callback from notifies (generated externally) and our * calls to kevent(). The former we just return 1 (ready) * since we do not know better. * In the latter we call netmap_poll and return 0/1 accordingly. */ static int netmap_knrw(struct knote *kn, long hint, int events) { struct netmap_priv_d *priv; int revents; if (hint != 0) { ND(5, "call from notify"); return 1; /* assume we are ready */ } priv = kn->kn_hook; /* the notification may come from an external thread, * in which case we do not want to run the netmap_poll * This should be filtered above, but check just in case. */ if (curthread != priv->np_td) { /* should not happen */ RD(5, "curthread changed %p %p", curthread, priv->np_td); return 1; } else { revents = netmap_poll((void *)priv, events, curthread); return (events & revents) ? 1 : 0; } } static int netmap_knread(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLIN); } static int netmap_knwrite(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLOUT); } static struct filterops netmap_rfiltops = { .f_isfd = 1, .f_detach = netmap_knrdetach, .f_event = netmap_knread, }; static struct filterops netmap_wfiltops = { .f_isfd = 1, .f_detach = netmap_knwdetach, .f_event = netmap_knwrite, }; /* * This is called when a thread invokes kevent() to record * a change in the configuration of the kqueue(). * The 'priv' should be the same as in the netmap device. */ static int netmap_kqfilter(struct cdev *dev, struct knote *kn) { struct netmap_priv_d *priv; int error; struct netmap_adapter *na; struct nm_selinfo *si; int ev = kn->kn_filter; if (ev != EVFILT_READ && ev != EVFILT_WRITE) { D("bad filter request %d", ev); return 1; } error = devfs_get_cdevpriv((void**)&priv); if (error) { D("device not yet setup"); return 1; } na = priv->np_na; if (na == NULL) { D("no netmap adapter for this file descriptor"); return 1; } /* the si is indicated in the priv */ si = (ev == EVFILT_WRITE) ? priv->np_txsi : priv->np_rxsi; // XXX lock(priv) ? kn->kn_fop = (ev == EVFILT_WRITE) ? &netmap_wfiltops : &netmap_rfiltops; kn->kn_hook = priv; knlist_add(&si->si.si_note, kn, 1); // XXX unlock(priv) ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s", na, na->ifp->if_xname, curthread, priv, kn, priv->np_nifp, kn->kn_fp == curthread->td_fpop ? "match" : "MISMATCH"); return 0; } struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, .d_ioctl = netmap_ioctl, .d_poll = netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; /*--- end of kqueue support ----*/ /* * Kernel entry point. * * Initialize/finalize the module and return. * * Return 0 on success, errno on failure. */ static int netmap_loader(__unused struct module *module, int event, __unused void *arg) { int error = 0; switch (event) { case MOD_LOAD: error = netmap_init(); break; case MOD_UNLOAD: netmap_fini(); break; default: error = EOPNOTSUPP; break; } return (error); } DEV_MODULE(netmap, netmap_loader, NULL); Index: head/sys/dev/oce/oce_if.c =================================================================== --- head/sys/dev/oce/oce_if.c (revision 275357) +++ head/sys/dev/oce/oce_if.c (revision 275358) @@ -1,2359 +1,2359 @@ /*- * Copyright (C) 2013 Emulex * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. Neither the name of the Emulex Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Contact Information: * freebsd-drivers@emulex.com * * Emulex * 3333 Susan Street * Costa Mesa, CA 92626 */ /* $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include "oce_if.h" /* UE Status Low CSR */ static char *ue_status_low_desc[] = { "CEV", "CTX", "DBUF", "ERX", "Host", "MPU", "NDMA", "PTC ", "RDMA ", "RXF ", "RXIPS ", "RXULP0 ", "RXULP1 ", "RXULP2 ", "TIM ", "TPOST ", "TPRE ", "TXIPS ", "TXULP0 ", "TXULP1 ", "UC ", "WDMA ", "TXULP2 ", "HOST1 ", "P0_OB_LINK ", "P1_OB_LINK ", "HOST_GPIO ", "MBOX ", "AXGMAC0", "AXGMAC1", "JTAG", "MPU_INTPEND" }; /* UE Status High CSR */ static char *ue_status_hi_desc[] = { "LPCMEMHOST", "MGMT_MAC", "PCS0ONLINE", "MPU_IRAM", "PCS1ONLINE", "PCTL0", "PCTL1", "PMEM", "RR", "TXPB", "RXPP", "XAUI", "TXP", "ARM", "IPC", "HOST2", "HOST3", "HOST4", "HOST5", "HOST6", "HOST7", "HOST8", "HOST9", "NETC", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown" }; /* Driver entry points prototypes */ static int oce_probe(device_t dev); static int oce_attach(device_t dev); static int oce_detach(device_t dev); static int oce_shutdown(device_t dev); static int oce_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void oce_init(void *xsc); static int oce_multiq_start(struct ifnet *ifp, struct mbuf *m); static void oce_multiq_flush(struct ifnet *ifp); /* Driver interrupt routines protypes */ static void oce_intr(void *arg, int pending); static int oce_setup_intr(POCE_SOFTC sc); static int oce_fast_isr(void *arg); static int oce_alloc_intr(POCE_SOFTC sc, int vector, void (*isr) (void *arg, int pending)); /* Media callbacks prototypes */ static void oce_media_status(struct ifnet *ifp, struct ifmediareq *req); static int oce_media_change(struct ifnet *ifp); /* Transmit routines prototypes */ static int oce_tx(POCE_SOFTC sc, struct mbuf **mpp, int wq_index); static void oce_tx_restart(POCE_SOFTC sc, struct oce_wq *wq); static void oce_tx_complete(struct oce_wq *wq, uint32_t wqe_idx, uint32_t status); static int oce_multiq_transmit(struct ifnet *ifp, struct mbuf *m, struct oce_wq *wq); /* Receive routines prototypes */ static void oce_discard_rx_comp(struct oce_rq *rq, struct oce_nic_rx_cqe *cqe); static int oce_cqe_vtp_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe); static int oce_cqe_portid_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe); static void oce_rx(struct oce_rq *rq, uint32_t rqe_idx, struct oce_nic_rx_cqe *cqe); /* Helper function prototypes in this file */ static int oce_attach_ifp(POCE_SOFTC sc); static void oce_add_vlan(void *arg, struct ifnet *ifp, uint16_t vtag); static void oce_del_vlan(void *arg, struct ifnet *ifp, uint16_t vtag); static int oce_vid_config(POCE_SOFTC sc); static void oce_mac_addr_set(POCE_SOFTC sc); static int oce_handle_passthrough(struct ifnet *ifp, caddr_t data); static void oce_local_timer(void *arg); static void oce_if_deactivate(POCE_SOFTC sc); static void oce_if_activate(POCE_SOFTC sc); static void setup_max_queues_want(POCE_SOFTC sc); static void update_queues_got(POCE_SOFTC sc); static void process_link_state(POCE_SOFTC sc, struct oce_async_cqe_link_state *acqe); static int oce_tx_asic_stall_verify(POCE_SOFTC sc, struct mbuf *m); static void oce_get_config(POCE_SOFTC sc); static struct mbuf *oce_insert_vlan_tag(POCE_SOFTC sc, struct mbuf *m, boolean_t *complete); /* IP specific */ #if defined(INET6) || defined(INET) static int oce_init_lro(POCE_SOFTC sc); static void oce_rx_flush_lro(struct oce_rq *rq); static struct mbuf * oce_tso_setup(POCE_SOFTC sc, struct mbuf **mpp); #endif static device_method_t oce_dispatch[] = { DEVMETHOD(device_probe, oce_probe), DEVMETHOD(device_attach, oce_attach), DEVMETHOD(device_detach, oce_detach), DEVMETHOD(device_shutdown, oce_shutdown), DEVMETHOD_END }; static driver_t oce_driver = { "oce", oce_dispatch, sizeof(OCE_SOFTC) }; static devclass_t oce_devclass; DRIVER_MODULE(oce, pci, oce_driver, oce_devclass, 0, 0); MODULE_DEPEND(oce, pci, 1, 1, 1); MODULE_DEPEND(oce, ether, 1, 1, 1); MODULE_VERSION(oce, 1); /* global vars */ const char component_revision[32] = {"///" COMPONENT_REVISION "///"}; /* Module capabilites and parameters */ uint32_t oce_max_rsp_handled = OCE_MAX_RSP_HANDLED; uint32_t oce_enable_rss = OCE_MODCAP_RSS; TUNABLE_INT("hw.oce.max_rsp_handled", &oce_max_rsp_handled); TUNABLE_INT("hw.oce.enable_rss", &oce_enable_rss); /* Supported devices table */ static uint32_t supportedDevices[] = { (PCI_VENDOR_SERVERENGINES << 16) | PCI_PRODUCT_BE2, (PCI_VENDOR_SERVERENGINES << 16) | PCI_PRODUCT_BE3, (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_BE3, (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_XE201, (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_XE201_VF, (PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_SH }; /***************************************************************************** * Driver entry points functions * *****************************************************************************/ static int oce_probe(device_t dev) { uint16_t vendor = 0; uint16_t device = 0; int i = 0; char str[256] = {0}; POCE_SOFTC sc; sc = device_get_softc(dev); bzero(sc, sizeof(OCE_SOFTC)); sc->dev = dev; vendor = pci_get_vendor(dev); device = pci_get_device(dev); for (i = 0; i < (sizeof(supportedDevices) / sizeof(uint32_t)); i++) { if (vendor == ((supportedDevices[i] >> 16) & 0xffff)) { if (device == (supportedDevices[i] & 0xffff)) { sprintf(str, "%s:%s", "Emulex CNA NIC function", component_revision); device_set_desc_copy(dev, str); switch (device) { case PCI_PRODUCT_BE2: sc->flags |= OCE_FLAGS_BE2; break; case PCI_PRODUCT_BE3: sc->flags |= OCE_FLAGS_BE3; break; case PCI_PRODUCT_XE201: case PCI_PRODUCT_XE201_VF: sc->flags |= OCE_FLAGS_XE201; break; case PCI_PRODUCT_SH: sc->flags |= OCE_FLAGS_SH; break; default: return ENXIO; } return BUS_PROBE_DEFAULT; } } } return ENXIO; } static int oce_attach(device_t dev) { POCE_SOFTC sc; int rc = 0; sc = device_get_softc(dev); rc = oce_hw_pci_alloc(sc); if (rc) return rc; sc->tx_ring_size = OCE_TX_RING_SIZE; sc->rx_ring_size = OCE_RX_RING_SIZE; sc->rq_frag_size = OCE_RQ_BUF_SIZE; sc->flow_control = OCE_DEFAULT_FLOW_CONTROL; sc->promisc = OCE_DEFAULT_PROMISCUOUS; LOCK_CREATE(&sc->bmbx_lock, "Mailbox_lock"); LOCK_CREATE(&sc->dev_lock, "Device_lock"); /* initialise the hardware */ rc = oce_hw_init(sc); if (rc) goto pci_res_free; oce_get_config(sc); setup_max_queues_want(sc); rc = oce_setup_intr(sc); if (rc) goto mbox_free; rc = oce_queue_init_all(sc); if (rc) goto intr_free; rc = oce_attach_ifp(sc); if (rc) goto queues_free; #if defined(INET6) || defined(INET) rc = oce_init_lro(sc); if (rc) goto ifp_free; #endif rc = oce_hw_start(sc); if (rc) goto lro_free; sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, oce_add_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, oce_del_vlan, sc, EVENTHANDLER_PRI_FIRST); rc = oce_stats_init(sc); if (rc) goto vlan_free; oce_add_sysctls(sc); callout_init(&sc->timer, CALLOUT_MPSAFE); rc = callout_reset(&sc->timer, 2 * hz, oce_local_timer, sc); if (rc) goto stats_free; return 0; stats_free: callout_drain(&sc->timer); oce_stats_free(sc); vlan_free: if (sc->vlan_attach) EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); if (sc->vlan_detach) EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); oce_hw_intr_disable(sc); lro_free: #if defined(INET6) || defined(INET) oce_free_lro(sc); ifp_free: #endif ether_ifdetach(sc->ifp); if_free(sc->ifp); queues_free: oce_queue_release_all(sc); intr_free: oce_intr_free(sc); mbox_free: oce_dma_free(sc, &sc->bsmbx); pci_res_free: oce_hw_pci_free(sc); LOCK_DESTROY(&sc->dev_lock); LOCK_DESTROY(&sc->bmbx_lock); return rc; } static int oce_detach(device_t dev) { POCE_SOFTC sc = device_get_softc(dev); LOCK(&sc->dev_lock); oce_if_deactivate(sc); UNLOCK(&sc->dev_lock); callout_drain(&sc->timer); if (sc->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); if (sc->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); ether_ifdetach(sc->ifp); if_free(sc->ifp); oce_hw_shutdown(sc); bus_generic_detach(dev); return 0; } static int oce_shutdown(device_t dev) { int rc; rc = oce_detach(dev); return rc; } static int oce_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; POCE_SOFTC sc = ifp->if_softc; int rc = 0; uint32_t u; switch (command) { case SIOCGIFMEDIA: rc = ifmedia_ioctl(ifp, ifr, &sc->media, command); break; case SIOCSIFMTU: if (ifr->ifr_mtu > OCE_MAX_MTU) rc = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { sc->ifp->if_drv_flags |= IFF_DRV_RUNNING; oce_init(sc); } device_printf(sc->dev, "Interface Up\n"); } else { LOCK(&sc->dev_lock); sc->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); oce_if_deactivate(sc); UNLOCK(&sc->dev_lock); device_printf(sc->dev, "Interface Down\n"); } if ((ifp->if_flags & IFF_PROMISC) && !sc->promisc) { if (!oce_rxf_set_promiscuous(sc, (1 | (1 << 1)))) sc->promisc = TRUE; } else if (!(ifp->if_flags & IFF_PROMISC) && sc->promisc) { if (!oce_rxf_set_promiscuous(sc, 0)) sc->promisc = FALSE; } break; case SIOCADDMULTI: case SIOCDELMULTI: rc = oce_hw_update_multicast(sc); if (rc) device_printf(sc->dev, "Update multicast address failed\n"); break; case SIOCSIFCAP: u = ifr->ifr_reqcap ^ ifp->if_capenable; if (u & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO; ifp->if_hwassist &= ~CSUM_TSO; if_printf(ifp, "TSO disabled due to -txcsum.\n"); } } if (u & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (u & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (IFCAP_TSO & ifp->if_capenable) { if (IFCAP_TXCSUM & ifp->if_capenable) ifp->if_hwassist |= CSUM_TSO; else { ifp->if_capenable &= ~IFCAP_TSO; ifp->if_hwassist &= ~CSUM_TSO; if_printf(ifp, "Enable txcsum first.\n"); rc = EAGAIN; } } else ifp->if_hwassist &= ~CSUM_TSO; } if (u & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (u & IFCAP_VLAN_HWFILTER) { ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; oce_vid_config(sc); } #if defined(INET6) || defined(INET) if (u & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; #endif break; case SIOCGPRIVATE_0: rc = oce_handle_passthrough(ifp, data); break; default: rc = ether_ioctl(ifp, command, data); break; } return rc; } static void oce_init(void *arg) { POCE_SOFTC sc = arg; LOCK(&sc->dev_lock); if (sc->ifp->if_flags & IFF_UP) { oce_if_deactivate(sc); oce_if_activate(sc); } UNLOCK(&sc->dev_lock); } static int oce_multiq_start(struct ifnet *ifp, struct mbuf *m) { POCE_SOFTC sc = ifp->if_softc; struct oce_wq *wq = NULL; int queue_index = 0; int status = 0; - if ((m->m_flags & M_FLOWID) != 0) + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) queue_index = m->m_pkthdr.flowid % sc->nwqs; wq = sc->wq[queue_index]; LOCK(&wq->tx_lock); status = oce_multiq_transmit(ifp, m, wq); UNLOCK(&wq->tx_lock); return status; } static void oce_multiq_flush(struct ifnet *ifp) { POCE_SOFTC sc = ifp->if_softc; struct mbuf *m; int i = 0; for (i = 0; i < sc->nwqs; i++) { while ((m = buf_ring_dequeue_sc(sc->wq[i]->br)) != NULL) m_freem(m); } if_qflush(ifp); } /***************************************************************************** * Driver interrupt routines functions * *****************************************************************************/ static void oce_intr(void *arg, int pending) { POCE_INTR_INFO ii = (POCE_INTR_INFO) arg; POCE_SOFTC sc = ii->sc; struct oce_eq *eq = ii->eq; struct oce_eqe *eqe; struct oce_cq *cq = NULL; int i, num_eqes = 0; bus_dmamap_sync(eq->ring->dma.tag, eq->ring->dma.map, BUS_DMASYNC_POSTWRITE); do { eqe = RING_GET_CONSUMER_ITEM_VA(eq->ring, struct oce_eqe); if (eqe->evnt == 0) break; eqe->evnt = 0; bus_dmamap_sync(eq->ring->dma.tag, eq->ring->dma.map, BUS_DMASYNC_POSTWRITE); RING_GET(eq->ring, 1); num_eqes++; } while (TRUE); if (!num_eqes) goto eq_arm; /* Spurious */ /* Clear EQ entries, but dont arm */ oce_arm_eq(sc, eq->eq_id, num_eqes, FALSE, FALSE); /* Process TX, RX and MCC. But dont arm CQ*/ for (i = 0; i < eq->cq_valid; i++) { cq = eq->cq[i]; (*cq->cq_handler)(cq->cb_arg); } /* Arm all cqs connected to this EQ */ for (i = 0; i < eq->cq_valid; i++) { cq = eq->cq[i]; oce_arm_cq(sc, cq->cq_id, 0, TRUE); } eq_arm: oce_arm_eq(sc, eq->eq_id, 0, TRUE, FALSE); return; } static int oce_setup_intr(POCE_SOFTC sc) { int rc = 0, use_intx = 0; int vector = 0, req_vectors = 0; if (is_rss_enabled(sc)) req_vectors = MAX((sc->nrqs - 1), sc->nwqs); else req_vectors = 1; if (sc->flags & OCE_FLAGS_MSIX_CAPABLE) { sc->intr_count = req_vectors; rc = pci_alloc_msix(sc->dev, &sc->intr_count); if (rc != 0) { use_intx = 1; pci_release_msi(sc->dev); } else sc->flags |= OCE_FLAGS_USING_MSIX; } else use_intx = 1; if (use_intx) sc->intr_count = 1; /* Scale number of queues based on intr we got */ update_queues_got(sc); if (use_intx) { device_printf(sc->dev, "Using legacy interrupt\n"); rc = oce_alloc_intr(sc, vector, oce_intr); if (rc) goto error; } else { for (; vector < sc->intr_count; vector++) { rc = oce_alloc_intr(sc, vector, oce_intr); if (rc) goto error; } } return 0; error: oce_intr_free(sc); return rc; } static int oce_fast_isr(void *arg) { POCE_INTR_INFO ii = (POCE_INTR_INFO) arg; POCE_SOFTC sc = ii->sc; if (ii->eq == NULL) return FILTER_STRAY; oce_arm_eq(sc, ii->eq->eq_id, 0, FALSE, TRUE); taskqueue_enqueue_fast(ii->tq, &ii->task); ii->eq->intr++; return FILTER_HANDLED; } static int oce_alloc_intr(POCE_SOFTC sc, int vector, void (*isr) (void *arg, int pending)) { POCE_INTR_INFO ii = &sc->intrs[vector]; int rc = 0, rr; if (vector >= OCE_MAX_EQ) return (EINVAL); /* Set the resource id for the interrupt. * MSIx is vector + 1 for the resource id, * INTx is 0 for the resource id. */ if (sc->flags & OCE_FLAGS_USING_MSIX) rr = vector + 1; else rr = 0; ii->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rr, RF_ACTIVE|RF_SHAREABLE); ii->irq_rr = rr; if (ii->intr_res == NULL) { device_printf(sc->dev, "Could not allocate interrupt\n"); rc = ENXIO; return rc; } TASK_INIT(&ii->task, 0, isr, ii); ii->vector = vector; sprintf(ii->task_name, "oce_task[%d]", ii->vector); ii->tq = taskqueue_create_fast(ii->task_name, M_NOWAIT, taskqueue_thread_enqueue, &ii->tq); taskqueue_start_threads(&ii->tq, 1, PI_NET, "%s taskq", device_get_nameunit(sc->dev)); ii->sc = sc; rc = bus_setup_intr(sc->dev, ii->intr_res, INTR_TYPE_NET, oce_fast_isr, NULL, ii, &ii->tag); return rc; } void oce_intr_free(POCE_SOFTC sc) { int i = 0; for (i = 0; i < sc->intr_count; i++) { if (sc->intrs[i].tag != NULL) bus_teardown_intr(sc->dev, sc->intrs[i].intr_res, sc->intrs[i].tag); if (sc->intrs[i].tq != NULL) taskqueue_free(sc->intrs[i].tq); if (sc->intrs[i].intr_res != NULL) bus_release_resource(sc->dev, SYS_RES_IRQ, sc->intrs[i].irq_rr, sc->intrs[i].intr_res); sc->intrs[i].tag = NULL; sc->intrs[i].intr_res = NULL; } if (sc->flags & OCE_FLAGS_USING_MSIX) pci_release_msi(sc->dev); } /****************************************************************************** * Media callbacks functions * ******************************************************************************/ static void oce_media_status(struct ifnet *ifp, struct ifmediareq *req) { POCE_SOFTC sc = (POCE_SOFTC) ifp->if_softc; req->ifm_status = IFM_AVALID; req->ifm_active = IFM_ETHER; if (sc->link_status == 1) req->ifm_status |= IFM_ACTIVE; else return; switch (sc->link_speed) { case 1: /* 10 Mbps */ req->ifm_active |= IFM_10_T | IFM_FDX; sc->speed = 10; break; case 2: /* 100 Mbps */ req->ifm_active |= IFM_100_TX | IFM_FDX; sc->speed = 100; break; case 3: /* 1 Gbps */ req->ifm_active |= IFM_1000_T | IFM_FDX; sc->speed = 1000; break; case 4: /* 10 Gbps */ req->ifm_active |= IFM_10G_SR | IFM_FDX; sc->speed = 10000; break; case 5: /* 20 Gbps */ req->ifm_active |= IFM_10G_SR | IFM_FDX; sc->speed = 20000; break; case 6: /* 25 Gbps */ req->ifm_active |= IFM_10G_SR | IFM_FDX; sc->speed = 25000; break; case 7: /* 40 Gbps */ req->ifm_active |= IFM_40G_SR4 | IFM_FDX; sc->speed = 40000; break; default: sc->speed = 0; break; } return; } int oce_media_change(struct ifnet *ifp) { return 0; } /***************************************************************************** * Transmit routines functions * *****************************************************************************/ static int oce_tx(POCE_SOFTC sc, struct mbuf **mpp, int wq_index) { int rc = 0, i, retry_cnt = 0; bus_dma_segment_t segs[OCE_MAX_TX_ELEMENTS]; struct mbuf *m, *m_temp; struct oce_wq *wq = sc->wq[wq_index]; struct oce_packet_desc *pd; struct oce_nic_hdr_wqe *nichdr; struct oce_nic_frag_wqe *nicfrag; int num_wqes; uint32_t reg_value; boolean_t complete = TRUE; m = *mpp; if (!m) return EINVAL; if (!(m->m_flags & M_PKTHDR)) { rc = ENXIO; goto free_ret; } if(oce_tx_asic_stall_verify(sc, m)) { m = oce_insert_vlan_tag(sc, m, &complete); if(!m) { device_printf(sc->dev, "Insertion unsuccessful\n"); return 0; } } if (m->m_pkthdr.csum_flags & CSUM_TSO) { /* consolidate packet buffers for TSO/LSO segment offload */ #if defined(INET6) || defined(INET) m = oce_tso_setup(sc, mpp); #else m = NULL; #endif if (m == NULL) { rc = ENXIO; goto free_ret; } } pd = &wq->pckts[wq->pkt_desc_head]; retry: rc = bus_dmamap_load_mbuf_sg(wq->tag, pd->map, m, segs, &pd->nsegs, BUS_DMA_NOWAIT); if (rc == 0) { num_wqes = pd->nsegs + 1; if (IS_BE(sc) || IS_SH(sc)) { /*Dummy required only for BE3.*/ if (num_wqes & 1) num_wqes++; } if (num_wqes >= RING_NUM_FREE(wq->ring)) { bus_dmamap_unload(wq->tag, pd->map); return EBUSY; } atomic_store_rel_int(&wq->pkt_desc_head, (wq->pkt_desc_head + 1) % \ OCE_WQ_PACKET_ARRAY_SIZE); bus_dmamap_sync(wq->tag, pd->map, BUS_DMASYNC_PREWRITE); pd->mbuf = m; nichdr = RING_GET_PRODUCER_ITEM_VA(wq->ring, struct oce_nic_hdr_wqe); nichdr->u0.dw[0] = 0; nichdr->u0.dw[1] = 0; nichdr->u0.dw[2] = 0; nichdr->u0.dw[3] = 0; nichdr->u0.s.complete = complete; nichdr->u0.s.event = 1; nichdr->u0.s.crc = 1; nichdr->u0.s.forward = 0; nichdr->u0.s.ipcs = (m->m_pkthdr.csum_flags & CSUM_IP) ? 1 : 0; nichdr->u0.s.udpcs = (m->m_pkthdr.csum_flags & CSUM_UDP) ? 1 : 0; nichdr->u0.s.tcpcs = (m->m_pkthdr.csum_flags & CSUM_TCP) ? 1 : 0; nichdr->u0.s.num_wqe = num_wqes; nichdr->u0.s.total_length = m->m_pkthdr.len; if (m->m_flags & M_VLANTAG) { nichdr->u0.s.vlan = 1; /*Vlan present*/ nichdr->u0.s.vlan_tag = m->m_pkthdr.ether_vtag; } if (m->m_pkthdr.csum_flags & CSUM_TSO) { if (m->m_pkthdr.tso_segsz) { nichdr->u0.s.lso = 1; nichdr->u0.s.lso_mss = m->m_pkthdr.tso_segsz; } if (!IS_BE(sc) || !IS_SH(sc)) nichdr->u0.s.ipcs = 1; } RING_PUT(wq->ring, 1); atomic_add_int(&wq->ring->num_used, 1); for (i = 0; i < pd->nsegs; i++) { nicfrag = RING_GET_PRODUCER_ITEM_VA(wq->ring, struct oce_nic_frag_wqe); nicfrag->u0.s.rsvd0 = 0; nicfrag->u0.s.frag_pa_hi = ADDR_HI(segs[i].ds_addr); nicfrag->u0.s.frag_pa_lo = ADDR_LO(segs[i].ds_addr); nicfrag->u0.s.frag_len = segs[i].ds_len; pd->wqe_idx = wq->ring->pidx; RING_PUT(wq->ring, 1); atomic_add_int(&wq->ring->num_used, 1); } if (num_wqes > (pd->nsegs + 1)) { nicfrag = RING_GET_PRODUCER_ITEM_VA(wq->ring, struct oce_nic_frag_wqe); nicfrag->u0.dw[0] = 0; nicfrag->u0.dw[1] = 0; nicfrag->u0.dw[2] = 0; nicfrag->u0.dw[3] = 0; pd->wqe_idx = wq->ring->pidx; RING_PUT(wq->ring, 1); atomic_add_int(&wq->ring->num_used, 1); pd->nsegs++; } if_inc_counter(sc->ifp, IFCOUNTER_OPACKETS, 1); wq->tx_stats.tx_reqs++; wq->tx_stats.tx_wrbs += num_wqes; wq->tx_stats.tx_bytes += m->m_pkthdr.len; wq->tx_stats.tx_pkts++; bus_dmamap_sync(wq->ring->dma.tag, wq->ring->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); reg_value = (num_wqes << 16) | wq->wq_id; OCE_WRITE_REG32(sc, db, wq->db_offset, reg_value); } else if (rc == EFBIG) { if (retry_cnt == 0) { m_temp = m_defrag(m, M_NOWAIT); if (m_temp == NULL) goto free_ret; m = m_temp; *mpp = m_temp; retry_cnt = retry_cnt + 1; goto retry; } else goto free_ret; } else if (rc == ENOMEM) return rc; else goto free_ret; return 0; free_ret: m_freem(*mpp); *mpp = NULL; return rc; } static void oce_tx_complete(struct oce_wq *wq, uint32_t wqe_idx, uint32_t status) { struct oce_packet_desc *pd; POCE_SOFTC sc = (POCE_SOFTC) wq->parent; struct mbuf *m; pd = &wq->pckts[wq->pkt_desc_tail]; atomic_store_rel_int(&wq->pkt_desc_tail, (wq->pkt_desc_tail + 1) % OCE_WQ_PACKET_ARRAY_SIZE); atomic_subtract_int(&wq->ring->num_used, pd->nsegs + 1); bus_dmamap_sync(wq->tag, pd->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(wq->tag, pd->map); m = pd->mbuf; m_freem(m); pd->mbuf = NULL; if (sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) { if (wq->ring->num_used < (wq->ring->num_items / 2)) { sc->ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE); oce_tx_restart(sc, wq); } } } static void oce_tx_restart(POCE_SOFTC sc, struct oce_wq *wq) { if ((sc->ifp->if_drv_flags & IFF_DRV_RUNNING) != IFF_DRV_RUNNING) return; #if __FreeBSD_version >= 800000 if (!drbr_empty(sc->ifp, wq->br)) #else if (!IFQ_DRV_IS_EMPTY(&sc->ifp->if_snd)) #endif taskqueue_enqueue_fast(taskqueue_swi, &wq->txtask); } #if defined(INET6) || defined(INET) static struct mbuf * oce_tso_setup(POCE_SOFTC sc, struct mbuf **mpp) { struct mbuf *m; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif struct ether_vlan_header *eh; struct tcphdr *th; uint16_t etype; int total_len = 0, ehdrlen = 0; m = *mpp; if (M_WRITABLE(m) == 0) { m = m_dup(*mpp, M_NOWAIT); if (!m) return NULL; m_freem(*mpp); *mpp = m; } eh = mtod(m, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } switch (etype) { #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(m->m_data + ehdrlen); if (ip->ip_p != IPPROTO_TCP) return NULL; th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); total_len = ehdrlen + (ip->ip_hl << 2) + (th->th_off << 2); break; #endif #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(m->m_data + ehdrlen); if (ip6->ip6_nxt != IPPROTO_TCP) return NULL; th = (struct tcphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); total_len = ehdrlen + sizeof(struct ip6_hdr) + (th->th_off << 2); break; #endif default: return NULL; } m = m_pullup(m, total_len); if (!m) return NULL; *mpp = m; return m; } #endif /* INET6 || INET */ void oce_tx_task(void *arg, int npending) { struct oce_wq *wq = arg; POCE_SOFTC sc = wq->parent; struct ifnet *ifp = sc->ifp; int rc = 0; #if __FreeBSD_version >= 800000 LOCK(&wq->tx_lock); rc = oce_multiq_transmit(ifp, NULL, wq); if (rc) { device_printf(sc->dev, "TX[%d] restart failed\n", wq->queue_index); } UNLOCK(&wq->tx_lock); #else oce_start(ifp); #endif } void oce_start(struct ifnet *ifp) { POCE_SOFTC sc = ifp->if_softc; struct mbuf *m; int rc = 0; int def_q = 0; /* Defualt tx queue is 0*/ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; if (!sc->link_status) return; do { IF_DEQUEUE(&sc->ifp->if_snd, m); if (m == NULL) break; LOCK(&sc->wq[def_q]->tx_lock); rc = oce_tx(sc, &m, def_q); UNLOCK(&sc->wq[def_q]->tx_lock); if (rc) { if (m != NULL) { sc->wq[def_q]->tx_stats.tx_stops ++; ifp->if_drv_flags |= IFF_DRV_OACTIVE; IFQ_DRV_PREPEND(&ifp->if_snd, m); m = NULL; } break; } if (m != NULL) ETHER_BPF_MTAP(ifp, m); } while (TRUE); return; } /* Handle the Completion Queue for transmit */ uint16_t oce_wq_handler(void *arg) { struct oce_wq *wq = (struct oce_wq *)arg; POCE_SOFTC sc = wq->parent; struct oce_cq *cq = wq->cq; struct oce_nic_tx_cqe *cqe; int num_cqes = 0; bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_tx_cqe); while (cqe->u0.dw[3]) { DW_SWAP((uint32_t *) cqe, sizeof(oce_wq_cqe)); wq->ring->cidx = cqe->u0.s.wqe_index + 1; if (wq->ring->cidx >= wq->ring->num_items) wq->ring->cidx -= wq->ring->num_items; oce_tx_complete(wq, cqe->u0.s.wqe_index, cqe->u0.s.status); wq->tx_stats.tx_compl++; cqe->u0.dw[3] = 0; RING_GET(cq->ring, 1); bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_tx_cqe); num_cqes++; } if (num_cqes) oce_arm_cq(sc, cq->cq_id, num_cqes, FALSE); return 0; } static int oce_multiq_transmit(struct ifnet *ifp, struct mbuf *m, struct oce_wq *wq) { POCE_SOFTC sc = ifp->if_softc; int status = 0, queue_index = 0; struct mbuf *next = NULL; struct buf_ring *br = NULL; br = wq->br; queue_index = wq->queue_index; if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) { if (m != NULL) status = drbr_enqueue(ifp, br, m); return status; } if (m != NULL) { if ((status = drbr_enqueue(ifp, br, m)) != 0) return status; } while ((next = drbr_peek(ifp, br)) != NULL) { if (oce_tx(sc, &next, queue_index)) { if (next == NULL) { drbr_advance(ifp, br); } else { drbr_putback(ifp, br, next); wq->tx_stats.tx_stops ++; ifp->if_drv_flags |= IFF_DRV_OACTIVE; } break; } drbr_advance(ifp, br); if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); if (next->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); ETHER_BPF_MTAP(ifp, next); } return 0; } /***************************************************************************** * Receive routines functions * *****************************************************************************/ static void oce_rx(struct oce_rq *rq, uint32_t rqe_idx, struct oce_nic_rx_cqe *cqe) { uint32_t out; struct oce_packet_desc *pd; POCE_SOFTC sc = (POCE_SOFTC) rq->parent; int i, len, frag_len; struct mbuf *m = NULL, *tail = NULL; uint16_t vtag; len = cqe->u0.s.pkt_size; if (!len) { /*partial DMA workaround for Lancer*/ oce_discard_rx_comp(rq, cqe); goto exit; } /* Get vlan_tag value */ if(IS_BE(sc) || IS_SH(sc)) vtag = BSWAP_16(cqe->u0.s.vlan_tag); else vtag = cqe->u0.s.vlan_tag; for (i = 0; i < cqe->u0.s.num_fragments; i++) { if (rq->packets_out == rq->packets_in) { device_printf(sc->dev, "RQ transmit descriptor missing\n"); } out = rq->packets_out + 1; if (out == OCE_RQ_PACKET_ARRAY_SIZE) out = 0; pd = &rq->pckts[rq->packets_out]; rq->packets_out = out; bus_dmamap_sync(rq->tag, pd->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(rq->tag, pd->map); rq->pending--; frag_len = (len > rq->cfg.frag_size) ? rq->cfg.frag_size : len; pd->mbuf->m_len = frag_len; if (tail != NULL) { /* additional fragments */ pd->mbuf->m_flags &= ~M_PKTHDR; tail->m_next = pd->mbuf; tail = pd->mbuf; } else { /* first fragment, fill out much of the packet header */ pd->mbuf->m_pkthdr.len = len; pd->mbuf->m_pkthdr.csum_flags = 0; if (IF_CSUM_ENABLED(sc)) { if (cqe->u0.s.l4_cksum_pass) { pd->mbuf->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); pd->mbuf->m_pkthdr.csum_data = 0xffff; } if (cqe->u0.s.ip_cksum_pass) { if (!cqe->u0.s.ip_ver) { /* IPV4 */ pd->mbuf->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); } } } m = tail = pd->mbuf; } pd->mbuf = NULL; len -= frag_len; } if (m) { if (!oce_cqe_portid_valid(sc, cqe)) { m_freem(m); goto exit; } m->m_pkthdr.rcvif = sc->ifp; #if __FreeBSD_version >= 800000 if (rq->queue_index) m->m_pkthdr.flowid = (rq->queue_index - 1); else m->m_pkthdr.flowid = rq->queue_index; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); #endif /* This deternies if vlan tag is Valid */ if (oce_cqe_vtp_valid(sc, cqe)) { if (sc->function_mode & FNM_FLEX10_MODE) { /* FLEX10. If QnQ is not set, neglect VLAN */ if (cqe->u0.s.qnq) { m->m_pkthdr.ether_vtag = vtag; m->m_flags |= M_VLANTAG; } } else if (sc->pvid != (vtag & VLAN_VID_MASK)) { /* In UMC mode generally pvid will be striped by hw. But in some cases we have seen it comes with pvid. So if pvid == vlan, neglect vlan. */ m->m_pkthdr.ether_vtag = vtag; m->m_flags |= M_VLANTAG; } } if_inc_counter(sc->ifp, IFCOUNTER_IPACKETS, 1); #if defined(INET6) || defined(INET) /* Try to queue to LRO */ if (IF_LRO_ENABLED(sc) && (cqe->u0.s.ip_cksum_pass) && (cqe->u0.s.l4_cksum_pass) && (!cqe->u0.s.ip_ver) && (rq->lro.lro_cnt != 0)) { if (tcp_lro_rx(&rq->lro, m, 0) == 0) { rq->lro_pkts_queued ++; goto post_done; } /* If LRO posting fails then try to post to STACK */ } #endif (*sc->ifp->if_input) (sc->ifp, m); #if defined(INET6) || defined(INET) post_done: #endif /* Update rx stats per queue */ rq->rx_stats.rx_pkts++; rq->rx_stats.rx_bytes += cqe->u0.s.pkt_size; rq->rx_stats.rx_frags += cqe->u0.s.num_fragments; if (cqe->u0.s.pkt_type == OCE_MULTICAST_PACKET) rq->rx_stats.rx_mcast_pkts++; if (cqe->u0.s.pkt_type == OCE_UNICAST_PACKET) rq->rx_stats.rx_ucast_pkts++; } exit: return; } static void oce_discard_rx_comp(struct oce_rq *rq, struct oce_nic_rx_cqe *cqe) { uint32_t out, i = 0; struct oce_packet_desc *pd; POCE_SOFTC sc = (POCE_SOFTC) rq->parent; int num_frags = cqe->u0.s.num_fragments; for (i = 0; i < num_frags; i++) { if (rq->packets_out == rq->packets_in) { device_printf(sc->dev, "RQ transmit descriptor missing\n"); } out = rq->packets_out + 1; if (out == OCE_RQ_PACKET_ARRAY_SIZE) out = 0; pd = &rq->pckts[rq->packets_out]; rq->packets_out = out; bus_dmamap_sync(rq->tag, pd->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(rq->tag, pd->map); rq->pending--; m_freem(pd->mbuf); } } static int oce_cqe_vtp_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe) { struct oce_nic_rx_cqe_v1 *cqe_v1; int vtp = 0; if (sc->be3_native) { cqe_v1 = (struct oce_nic_rx_cqe_v1 *)cqe; vtp = cqe_v1->u0.s.vlan_tag_present; } else vtp = cqe->u0.s.vlan_tag_present; return vtp; } static int oce_cqe_portid_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe) { struct oce_nic_rx_cqe_v1 *cqe_v1; int port_id = 0; if (sc->be3_native && (IS_BE(sc) || IS_SH(sc))) { cqe_v1 = (struct oce_nic_rx_cqe_v1 *)cqe; port_id = cqe_v1->u0.s.port; if (sc->port_id != port_id) return 0; } else ;/* For BE3 legacy and Lancer this is dummy */ return 1; } #if defined(INET6) || defined(INET) static void oce_rx_flush_lro(struct oce_rq *rq) { struct lro_ctrl *lro = &rq->lro; struct lro_entry *queued; POCE_SOFTC sc = (POCE_SOFTC) rq->parent; if (!IF_LRO_ENABLED(sc)) return; while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } rq->lro_pkts_queued = 0; return; } static int oce_init_lro(POCE_SOFTC sc) { struct lro_ctrl *lro = NULL; int i = 0, rc = 0; for (i = 0; i < sc->nrqs; i++) { lro = &sc->rq[i]->lro; rc = tcp_lro_init(lro); if (rc != 0) { device_printf(sc->dev, "LRO init failed\n"); return rc; } lro->ifp = sc->ifp; } return rc; } void oce_free_lro(POCE_SOFTC sc) { struct lro_ctrl *lro = NULL; int i = 0; for (i = 0; i < sc->nrqs; i++) { lro = &sc->rq[i]->lro; if (lro) tcp_lro_free(lro); } } #endif int oce_alloc_rx_bufs(struct oce_rq *rq, int count) { POCE_SOFTC sc = (POCE_SOFTC) rq->parent; int i, in, rc; struct oce_packet_desc *pd; bus_dma_segment_t segs[6]; int nsegs, added = 0; struct oce_nic_rqe *rqe; pd_rxulp_db_t rxdb_reg; bzero(&rxdb_reg, sizeof(pd_rxulp_db_t)); for (i = 0; i < count; i++) { in = rq->packets_in + 1; if (in == OCE_RQ_PACKET_ARRAY_SIZE) in = 0; if (in == rq->packets_out) break; /* no more room */ pd = &rq->pckts[rq->packets_in]; pd->mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (pd->mbuf == NULL) break; pd->mbuf->m_len = pd->mbuf->m_pkthdr.len = MCLBYTES; rc = bus_dmamap_load_mbuf_sg(rq->tag, pd->map, pd->mbuf, segs, &nsegs, BUS_DMA_NOWAIT); if (rc) { m_free(pd->mbuf); break; } if (nsegs != 1) { i--; continue; } rq->packets_in = in; bus_dmamap_sync(rq->tag, pd->map, BUS_DMASYNC_PREREAD); rqe = RING_GET_PRODUCER_ITEM_VA(rq->ring, struct oce_nic_rqe); rqe->u0.s.frag_pa_hi = ADDR_HI(segs[0].ds_addr); rqe->u0.s.frag_pa_lo = ADDR_LO(segs[0].ds_addr); DW_SWAP(u32ptr(rqe), sizeof(struct oce_nic_rqe)); RING_PUT(rq->ring, 1); added++; rq->pending++; } if (added != 0) { for (i = added / OCE_MAX_RQ_POSTS; i > 0; i--) { rxdb_reg.bits.num_posted = OCE_MAX_RQ_POSTS; rxdb_reg.bits.qid = rq->rq_id; OCE_WRITE_REG32(sc, db, PD_RXULP_DB, rxdb_reg.dw0); added -= OCE_MAX_RQ_POSTS; } if (added > 0) { rxdb_reg.bits.qid = rq->rq_id; rxdb_reg.bits.num_posted = added; OCE_WRITE_REG32(sc, db, PD_RXULP_DB, rxdb_reg.dw0); } } return 0; } /* Handle the Completion Queue for receive */ uint16_t oce_rq_handler(void *arg) { struct oce_rq *rq = (struct oce_rq *)arg; struct oce_cq *cq = rq->cq; POCE_SOFTC sc = rq->parent; struct oce_nic_rx_cqe *cqe; int num_cqes = 0, rq_buffers_used = 0; bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_rx_cqe); while (cqe->u0.dw[2]) { DW_SWAP((uint32_t *) cqe, sizeof(oce_rq_cqe)); RING_GET(rq->ring, 1); if (cqe->u0.s.error == 0) { oce_rx(rq, cqe->u0.s.frag_index, cqe); } else { rq->rx_stats.rxcp_err++; if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1); /* Post L3/L4 errors to stack.*/ oce_rx(rq, cqe->u0.s.frag_index, cqe); } rq->rx_stats.rx_compl++; cqe->u0.dw[2] = 0; #if defined(INET6) || defined(INET) if (IF_LRO_ENABLED(sc) && rq->lro_pkts_queued >= 16) { oce_rx_flush_lro(rq); } #endif RING_GET(cq->ring, 1); bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_rx_cqe); num_cqes++; if (num_cqes >= (IS_XE201(sc) ? 8 : oce_max_rsp_handled)) break; } #if defined(INET6) || defined(INET) if (IF_LRO_ENABLED(sc)) oce_rx_flush_lro(rq); #endif if (num_cqes) { oce_arm_cq(sc, cq->cq_id, num_cqes, FALSE); rq_buffers_used = OCE_RQ_PACKET_ARRAY_SIZE - rq->pending; if (rq_buffers_used > 1) oce_alloc_rx_bufs(rq, (rq_buffers_used - 1)); } return 0; } /***************************************************************************** * Helper function prototypes in this file * *****************************************************************************/ static int oce_attach_ifp(POCE_SOFTC sc) { sc->ifp = if_alloc(IFT_ETHER); if (!sc->ifp) return ENOMEM; ifmedia_init(&sc->media, IFM_IMASK, oce_media_change, oce_media_status); ifmedia_add(&sc->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->media, IFM_ETHER | IFM_AUTO); sc->ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST; sc->ifp->if_ioctl = oce_ioctl; sc->ifp->if_start = oce_start; sc->ifp->if_init = oce_init; sc->ifp->if_mtu = ETHERMTU; sc->ifp->if_softc = sc; #if __FreeBSD_version >= 800000 sc->ifp->if_transmit = oce_multiq_start; sc->ifp->if_qflush = oce_multiq_flush; #endif if_initname(sc->ifp, device_get_name(sc->dev), device_get_unit(sc->dev)); sc->ifp->if_snd.ifq_drv_maxlen = OCE_MAX_TX_DESC - 1; IFQ_SET_MAXLEN(&sc->ifp->if_snd, sc->ifp->if_snd.ifq_drv_maxlen); IFQ_SET_READY(&sc->ifp->if_snd); sc->ifp->if_hwassist = OCE_IF_HWASSIST; sc->ifp->if_hwassist |= CSUM_TSO; sc->ifp->if_hwassist |= (CSUM_IP | CSUM_TCP | CSUM_UDP); sc->ifp->if_capabilities = OCE_IF_CAPABILITIES; sc->ifp->if_capabilities |= IFCAP_HWCSUM; sc->ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; #if defined(INET6) || defined(INET) sc->ifp->if_capabilities |= IFCAP_TSO; sc->ifp->if_capabilities |= IFCAP_LRO; sc->ifp->if_capabilities |= IFCAP_VLAN_HWTSO; #endif sc->ifp->if_capenable = sc->ifp->if_capabilities; sc->ifp->if_baudrate = IF_Gbps(10); #if __FreeBSD_version >= 1000000 sc->ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); sc->ifp->if_hw_tsomaxsegcount = OCE_MAX_TX_ELEMENTS; sc->ifp->if_hw_tsomaxsegsize = 4096; #endif ether_ifattach(sc->ifp, sc->macaddr.mac_addr); return 0; } static void oce_add_vlan(void *arg, struct ifnet *ifp, uint16_t vtag) { POCE_SOFTC sc = ifp->if_softc; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) return; sc->vlan_tag[vtag] = 1; sc->vlans_added++; if (sc->vlans_added <= (sc->max_vlans + 1)) oce_vid_config(sc); } static void oce_del_vlan(void *arg, struct ifnet *ifp, uint16_t vtag) { POCE_SOFTC sc = ifp->if_softc; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) return; sc->vlan_tag[vtag] = 0; sc->vlans_added--; oce_vid_config(sc); } /* * A max of 64 vlans can be configured in BE. If the user configures * more, place the card in vlan promiscuous mode. */ static int oce_vid_config(POCE_SOFTC sc) { struct normal_vlan vtags[MAX_VLANFILTER_SIZE]; uint16_t ntags = 0, i; int status = 0; if ((sc->vlans_added <= MAX_VLANFILTER_SIZE) && (sc->ifp->if_capenable & IFCAP_VLAN_HWFILTER)) { for (i = 0; i < MAX_VLANS; i++) { if (sc->vlan_tag[i]) { vtags[ntags].vtag = i; ntags++; } } if (ntags) status = oce_config_vlan(sc, (uint8_t) sc->if_id, vtags, ntags, 1, 0); } else status = oce_config_vlan(sc, (uint8_t) sc->if_id, NULL, 0, 1, 1); return status; } static void oce_mac_addr_set(POCE_SOFTC sc) { uint32_t old_pmac_id = sc->pmac_id; int status = 0; status = bcmp((IF_LLADDR(sc->ifp)), sc->macaddr.mac_addr, sc->macaddr.size_of_struct); if (!status) return; status = oce_mbox_macaddr_add(sc, (uint8_t *)(IF_LLADDR(sc->ifp)), sc->if_id, &sc->pmac_id); if (!status) { status = oce_mbox_macaddr_del(sc, sc->if_id, old_pmac_id); bcopy((IF_LLADDR(sc->ifp)), sc->macaddr.mac_addr, sc->macaddr.size_of_struct); } if (status) device_printf(sc->dev, "Failed update macaddress\n"); } static int oce_handle_passthrough(struct ifnet *ifp, caddr_t data) { POCE_SOFTC sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int rc = ENXIO; char cookie[32] = {0}; void *priv_data = (void *)ifr->ifr_data; void *ioctl_ptr; uint32_t req_size; struct mbx_hdr req; OCE_DMA_MEM dma_mem; struct mbx_common_get_cntl_attr *fw_cmd; if (copyin(priv_data, cookie, strlen(IOCTL_COOKIE))) return EFAULT; if (memcmp(cookie, IOCTL_COOKIE, strlen(IOCTL_COOKIE))) return EINVAL; ioctl_ptr = (char *)priv_data + strlen(IOCTL_COOKIE); if (copyin(ioctl_ptr, &req, sizeof(struct mbx_hdr))) return EFAULT; req_size = le32toh(req.u0.req.request_length); if (req_size > 65536) return EINVAL; req_size += sizeof(struct mbx_hdr); rc = oce_dma_alloc(sc, req_size, &dma_mem, 0); if (rc) return ENOMEM; if (copyin(ioctl_ptr, OCE_DMAPTR(&dma_mem,char), req_size)) { rc = EFAULT; goto dma_free; } rc = oce_pass_through_mbox(sc, &dma_mem, req_size); if (rc) { rc = EIO; goto dma_free; } if (copyout(OCE_DMAPTR(&dma_mem,char), ioctl_ptr, req_size)) rc = EFAULT; /* firmware is filling all the attributes for this ioctl except the driver version..so fill it */ if(req.u0.rsp.opcode == OPCODE_COMMON_GET_CNTL_ATTRIBUTES) { fw_cmd = (struct mbx_common_get_cntl_attr *) ioctl_ptr; strncpy(fw_cmd->params.rsp.cntl_attr_info.hba_attr.drv_ver_str, COMPONENT_REVISION, strlen(COMPONENT_REVISION)); } dma_free: oce_dma_free(sc, &dma_mem); return rc; } static void oce_eqd_set_periodic(POCE_SOFTC sc) { struct oce_set_eqd set_eqd[OCE_MAX_EQ]; struct oce_aic_obj *aic; struct oce_eq *eqo; uint64_t now = 0, delta; int eqd, i, num = 0; uint32_t ips = 0; int tps; for (i = 0 ; i < sc->neqs; i++) { eqo = sc->eq[i]; aic = &sc->aic_obj[i]; /* When setting the static eq delay from the user space */ if (!aic->enable) { eqd = aic->et_eqd; goto modify_eqd; } now = ticks; /* Over flow check */ if ((now < aic->ticks) || (eqo->intr < aic->intr_prev)) goto done; delta = now - aic->ticks; tps = delta/hz; /* Interrupt rate based on elapsed ticks */ if(tps) ips = (uint32_t)(eqo->intr - aic->intr_prev) / tps; if (ips > INTR_RATE_HWM) eqd = aic->cur_eqd + 20; else if (ips < INTR_RATE_LWM) eqd = aic->cur_eqd / 2; else goto done; if (eqd < 10) eqd = 0; /* Make sure that the eq delay is in the known range */ eqd = min(eqd, aic->max_eqd); eqd = max(eqd, aic->min_eqd); modify_eqd: if (eqd != aic->cur_eqd) { set_eqd[num].delay_multiplier = (eqd * 65)/100; set_eqd[num].eq_id = eqo->eq_id; aic->cur_eqd = eqd; num++; } done: aic->intr_prev = eqo->intr; aic->ticks = now; } /* Is there atleast one eq that needs to be modified? */ if(num) oce_mbox_eqd_modify_periodic(sc, set_eqd, num); } static void oce_detect_hw_error(POCE_SOFTC sc) { uint32_t ue_low = 0, ue_high = 0, ue_low_mask = 0, ue_high_mask = 0; uint32_t sliport_status = 0, sliport_err1 = 0, sliport_err2 = 0; uint32_t i; if (sc->hw_error) return; if (IS_XE201(sc)) { sliport_status = OCE_READ_REG32(sc, db, SLIPORT_STATUS_OFFSET); if (sliport_status & SLIPORT_STATUS_ERR_MASK) { sliport_err1 = OCE_READ_REG32(sc, db, SLIPORT_ERROR1_OFFSET); sliport_err2 = OCE_READ_REG32(sc, db, SLIPORT_ERROR2_OFFSET); } } else { ue_low = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_LOW); ue_high = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_HIGH); ue_low_mask = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_LOW_MASK); ue_high_mask = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_HI_MASK); ue_low = (ue_low & ~ue_low_mask); ue_high = (ue_high & ~ue_high_mask); } /* On certain platforms BE hardware can indicate spurious UEs. * Allow the h/w to stop working completely in case of a real UE. * Hence not setting the hw_error for UE detection. */ if (sliport_status & SLIPORT_STATUS_ERR_MASK) { sc->hw_error = TRUE; device_printf(sc->dev, "Error detected in the card\n"); } if (sliport_status & SLIPORT_STATUS_ERR_MASK) { device_printf(sc->dev, "ERR: sliport status 0x%x\n", sliport_status); device_printf(sc->dev, "ERR: sliport error1 0x%x\n", sliport_err1); device_printf(sc->dev, "ERR: sliport error2 0x%x\n", sliport_err2); } if (ue_low) { for (i = 0; ue_low; ue_low >>= 1, i++) { if (ue_low & 1) device_printf(sc->dev, "UE: %s bit set\n", ue_status_low_desc[i]); } } if (ue_high) { for (i = 0; ue_high; ue_high >>= 1, i++) { if (ue_high & 1) device_printf(sc->dev, "UE: %s bit set\n", ue_status_hi_desc[i]); } } } static void oce_local_timer(void *arg) { POCE_SOFTC sc = arg; int i = 0; oce_detect_hw_error(sc); oce_refresh_nic_stats(sc); oce_refresh_queue_stats(sc); oce_mac_addr_set(sc); /* TX Watch Dog*/ for (i = 0; i < sc->nwqs; i++) oce_tx_restart(sc, sc->wq[i]); /* calculate and set the eq delay for optimal interrupt rate */ if (IS_BE(sc) || IS_SH(sc)) oce_eqd_set_periodic(sc); callout_reset(&sc->timer, hz, oce_local_timer, sc); } /* NOTE : This should only be called holding * DEVICE_LOCK. */ static void oce_if_deactivate(POCE_SOFTC sc) { int i, mtime = 0; int wait_req = 0; struct oce_rq *rq; struct oce_wq *wq; struct oce_eq *eq; sc->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); /*Wait for max of 400ms for TX completions to be done */ while (mtime < 400) { wait_req = 0; for_all_wq_queues(sc, wq, i) { if (wq->ring->num_used) { wait_req = 1; DELAY(1); break; } } mtime += 1; if (!wait_req) break; } /* Stop intrs and finish any bottom halves pending */ oce_hw_intr_disable(sc); /* Since taskqueue_drain takes a Gaint Lock, We should not acquire any other lock. So unlock device lock and require after completing taskqueue_drain. */ UNLOCK(&sc->dev_lock); for (i = 0; i < sc->intr_count; i++) { if (sc->intrs[i].tq != NULL) { taskqueue_drain(sc->intrs[i].tq, &sc->intrs[i].task); } } LOCK(&sc->dev_lock); /* Delete RX queue in card with flush param */ oce_stop_rx(sc); /* Invalidate any pending cq and eq entries*/ for_all_evnt_queues(sc, eq, i) oce_drain_eq(eq); for_all_rq_queues(sc, rq, i) oce_drain_rq_cq(rq); for_all_wq_queues(sc, wq, i) oce_drain_wq_cq(wq); /* But still we need to get MCC aync events. So enable intrs and also arm first EQ */ oce_hw_intr_enable(sc); oce_arm_eq(sc, sc->eq[0]->eq_id, 0, TRUE, FALSE); DELAY(10); } static void oce_if_activate(POCE_SOFTC sc) { struct oce_eq *eq; struct oce_rq *rq; struct oce_wq *wq; int i, rc = 0; sc->ifp->if_drv_flags |= IFF_DRV_RUNNING; oce_hw_intr_disable(sc); oce_start_rx(sc); for_all_rq_queues(sc, rq, i) { rc = oce_start_rq(rq); if (rc) device_printf(sc->dev, "Unable to start RX\n"); } for_all_wq_queues(sc, wq, i) { rc = oce_start_wq(wq); if (rc) device_printf(sc->dev, "Unable to start TX\n"); } for_all_evnt_queues(sc, eq, i) oce_arm_eq(sc, eq->eq_id, 0, TRUE, FALSE); oce_hw_intr_enable(sc); } static void process_link_state(POCE_SOFTC sc, struct oce_async_cqe_link_state *acqe) { /* Update Link status */ if ((acqe->u0.s.link_status & ~ASYNC_EVENT_LOGICAL) == ASYNC_EVENT_LINK_UP) { sc->link_status = ASYNC_EVENT_LINK_UP; if_link_state_change(sc->ifp, LINK_STATE_UP); } else { sc->link_status = ASYNC_EVENT_LINK_DOWN; if_link_state_change(sc->ifp, LINK_STATE_DOWN); } } /* Handle the Completion Queue for the Mailbox/Async notifications */ uint16_t oce_mq_handler(void *arg) { struct oce_mq *mq = (struct oce_mq *)arg; POCE_SOFTC sc = mq->parent; struct oce_cq *cq = mq->cq; int num_cqes = 0, evt_type = 0, optype = 0; struct oce_mq_cqe *cqe; struct oce_async_cqe_link_state *acqe; struct oce_async_event_grp5_pvid_state *gcqe; struct oce_async_event_qnq *dbgcqe; bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_mq_cqe); while (cqe->u0.dw[3]) { DW_SWAP((uint32_t *) cqe, sizeof(oce_mq_cqe)); if (cqe->u0.s.async_event) { evt_type = cqe->u0.s.event_type; optype = cqe->u0.s.async_type; if (evt_type == ASYNC_EVENT_CODE_LINK_STATE) { /* Link status evt */ acqe = (struct oce_async_cqe_link_state *)cqe; process_link_state(sc, acqe); } else if ((evt_type == ASYNC_EVENT_GRP5) && (optype == ASYNC_EVENT_PVID_STATE)) { /* GRP5 PVID */ gcqe = (struct oce_async_event_grp5_pvid_state *)cqe; if (gcqe->enabled) sc->pvid = gcqe->tag & VLAN_VID_MASK; else sc->pvid = 0; } else if(evt_type == ASYNC_EVENT_CODE_DEBUG && optype == ASYNC_EVENT_DEBUG_QNQ) { dbgcqe = (struct oce_async_event_qnq *)cqe; if(dbgcqe->valid) sc->qnqid = dbgcqe->vlan_tag; sc->qnq_debug_event = TRUE; } } cqe->u0.dw[3] = 0; RING_GET(cq->ring, 1); bus_dmamap_sync(cq->ring->dma.tag, cq->ring->dma.map, BUS_DMASYNC_POSTWRITE); cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_mq_cqe); num_cqes++; } if (num_cqes) oce_arm_cq(sc, cq->cq_id, num_cqes, FALSE); return 0; } static void setup_max_queues_want(POCE_SOFTC sc) { /* Check if it is FLEX machine. Is so dont use RSS */ if ((sc->function_mode & FNM_FLEX10_MODE) || (sc->function_mode & FNM_UMC_MODE) || (sc->function_mode & FNM_VNIC_MODE) || (!is_rss_enabled(sc)) || IS_BE2(sc)) { sc->nrqs = 1; sc->nwqs = 1; } else { sc->nrqs = MIN(OCE_NCPUS, sc->nrssqs) + 1; sc->nwqs = MIN(OCE_NCPUS, sc->nrssqs); } if (IS_BE2(sc) && is_rss_enabled(sc)) sc->nrqs = MIN(OCE_NCPUS, sc->nrssqs) + 1; } static void update_queues_got(POCE_SOFTC sc) { if (is_rss_enabled(sc)) { sc->nrqs = sc->intr_count + 1; sc->nwqs = sc->intr_count; } else { sc->nrqs = 1; sc->nwqs = 1; } if (IS_BE2(sc)) sc->nwqs = 1; } static int oce_check_ipv6_ext_hdr(struct mbuf *m) { struct ether_header *eh = mtod(m, struct ether_header *); caddr_t m_datatemp = m->m_data; if (eh->ether_type == htons(ETHERTYPE_IPV6)) { m->m_data += sizeof(struct ether_header); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); if((ip6->ip6_nxt != IPPROTO_TCP) && \ (ip6->ip6_nxt != IPPROTO_UDP)){ struct ip6_ext *ip6e = NULL; m->m_data += sizeof(struct ip6_hdr); ip6e = (struct ip6_ext *) mtod(m, struct ip6_ext *); if(ip6e->ip6e_len == 0xff) { m->m_data = m_datatemp; return TRUE; } } m->m_data = m_datatemp; } return FALSE; } static int is_be3_a1(POCE_SOFTC sc) { if((sc->flags & OCE_FLAGS_BE3) && ((sc->asic_revision & 0xFF) < 2)) { return TRUE; } return FALSE; } static struct mbuf * oce_insert_vlan_tag(POCE_SOFTC sc, struct mbuf *m, boolean_t *complete) { uint16_t vlan_tag = 0; if(!M_WRITABLE(m)) return NULL; /* Embed vlan tag in the packet if it is not part of it */ if(m->m_flags & M_VLANTAG) { vlan_tag = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag); m->m_flags &= ~M_VLANTAG; } /* if UMC, ignore vlan tag insertion and instead insert pvid */ if(sc->pvid) { if(!vlan_tag) vlan_tag = sc->pvid; *complete = FALSE; } if(vlan_tag) { m = ether_vlanencap(m, vlan_tag); } if(sc->qnqid) { m = ether_vlanencap(m, sc->qnqid); *complete = FALSE; } return m; } static int oce_tx_asic_stall_verify(POCE_SOFTC sc, struct mbuf *m) { if(is_be3_a1(sc) && IS_QNQ_OR_UMC(sc) && \ oce_check_ipv6_ext_hdr(m)) { return TRUE; } return FALSE; } static void oce_get_config(POCE_SOFTC sc) { int rc = 0; uint32_t max_rss = 0; if ((IS_BE(sc) || IS_SH(sc)) && (!sc->be3_native)) max_rss = OCE_LEGACY_MODE_RSS; else max_rss = OCE_MAX_RSS; if (!IS_BE(sc)) { rc = oce_get_profile_config(sc, max_rss); if (rc) { sc->nwqs = OCE_MAX_WQ; sc->nrssqs = max_rss; sc->nrqs = sc->nrssqs + 1; } } else { /* For BE3 don't rely on fw for determining the resources */ sc->nrssqs = max_rss; sc->nrqs = sc->nrssqs + 1; sc->nwqs = OCE_MAX_WQ; sc->max_vlans = MAX_VLANFILTER_SIZE; } } Index: head/sys/dev/qlxgbe/ql_isr.c =================================================================== --- head/sys/dev/qlxgbe/ql_isr.c (revision 275357) +++ head/sys/dev/qlxgbe/ql_isr.c (revision 275358) @@ -1,883 +1,883 @@ /* * Copyright (c) 2013-2014 Qlogic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * File: ql_isr.c * Author : David C Somayajulu, Qlogic Corporation, Aliso Viejo, CA 92656. */ #include __FBSDID("$FreeBSD$"); #include "ql_os.h" #include "ql_hw.h" #include "ql_def.h" #include "ql_inline.h" #include "ql_ver.h" #include "ql_glbl.h" #include "ql_dbg.h" static void qla_replenish_normal_rx(qla_host_t *ha, qla_sds_t *sdsp, uint32_t r_idx); static void qla_rcv_error(qla_host_t *ha) { ha->flags.stop_rcv = 1; ha->qla_initiate_recovery = 1; } /* * Name: qla_rx_intr * Function: Handles normal ethernet frames received */ static void qla_rx_intr(qla_host_t *ha, qla_sgl_rcv_t *sgc, uint32_t sds_idx) { qla_rx_buf_t *rxb; struct mbuf *mp = NULL, *mpf = NULL, *mpl = NULL; struct ifnet *ifp = ha->ifp; qla_sds_t *sdsp; struct ether_vlan_header *eh; uint32_t i, rem_len = 0; uint32_t r_idx = 0; qla_rx_ring_t *rx_ring; if (ha->hw.num_rds_rings > 1) r_idx = sds_idx; ha->hw.rds[r_idx].count++; sdsp = &ha->hw.sds[sds_idx]; rx_ring = &ha->rx_ring[r_idx]; for (i = 0; i < sgc->num_handles; i++) { rxb = &rx_ring->rx_buf[sgc->handle[i] & 0x7FFF]; QL_ASSERT(ha, (rxb != NULL), ("%s: [sds_idx]=[%d] rxb != NULL\n", __func__,\ sds_idx)); if ((rxb == NULL) || QL_ERR_INJECT(ha, INJCT_RX_RXB_INVAL)) { /* log the error */ device_printf(ha->pci_dev, "%s invalid rxb[%d, %d, 0x%04x]\n", __func__, sds_idx, i, sgc->handle[i]); qla_rcv_error(ha); return; } mp = rxb->m_head; if (i == 0) mpf = mp; QL_ASSERT(ha, (mp != NULL), ("%s: [sds_idx]=[%d] mp != NULL\n", __func__,\ sds_idx)); bus_dmamap_sync(ha->rx_tag, rxb->map, BUS_DMASYNC_POSTREAD); rxb->m_head = NULL; rxb->next = sdsp->rxb_free; sdsp->rxb_free = rxb; sdsp->rx_free++; if ((mp == NULL) || QL_ERR_INJECT(ha, INJCT_RX_MP_NULL)) { /* log the error */ device_printf(ha->pci_dev, "%s mp == NULL [%d, %d, 0x%04x]\n", __func__, sds_idx, i, sgc->handle[i]); qla_rcv_error(ha); return; } if (i == 0) { mpl = mpf = mp; mp->m_flags |= M_PKTHDR; mp->m_pkthdr.len = sgc->pkt_length; mp->m_pkthdr.rcvif = ifp; rem_len = mp->m_pkthdr.len; } else { mp->m_flags &= ~M_PKTHDR; mpl->m_next = mp; mpl = mp; rem_len = rem_len - mp->m_len; } } mpl->m_len = rem_len; eh = mtod(mpf, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { uint32_t *data = (uint32_t *)eh; mpf->m_pkthdr.ether_vtag = ntohs(eh->evl_tag); mpf->m_flags |= M_VLANTAG; *(data + 3) = *(data + 2); *(data + 2) = *(data + 1); *(data + 1) = *data; m_adj(mpf, ETHER_VLAN_ENCAP_LEN); } if (sgc->chksum_status == Q8_STAT_DESC_STATUS_CHKSUM_OK) { mpf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mpf->m_pkthdr.csum_data = 0xFFFF; } else { mpf->m_pkthdr.csum_flags = 0; } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); mpf->m_pkthdr.flowid = sgc->rss_hash; - mpf->m_flags |= M_FLOWID; + M_HASHTYPE_SET(mpf, M_HASHTYPE_OPAQUE); (*ifp->if_input)(ifp, mpf); if (sdsp->rx_free > ha->std_replenish) qla_replenish_normal_rx(ha, sdsp, r_idx); return; } #define QLA_TCP_HDR_SIZE 20 #define QLA_TCP_TS_OPTION_SIZE 12 /* * Name: qla_lro_intr * Function: Handles normal ethernet frames received */ static int qla_lro_intr(qla_host_t *ha, qla_sgl_lro_t *sgc, uint32_t sds_idx) { qla_rx_buf_t *rxb; struct mbuf *mp = NULL, *mpf = NULL, *mpl = NULL; struct ifnet *ifp = ha->ifp; qla_sds_t *sdsp; struct ether_vlan_header *eh; uint32_t i, rem_len = 0, pkt_length, iplen; struct tcphdr *th; struct ip *ip = NULL; struct ip6_hdr *ip6 = NULL; uint16_t etype; uint32_t r_idx = 0; qla_rx_ring_t *rx_ring; if (ha->hw.num_rds_rings > 1) r_idx = sds_idx; ha->hw.rds[r_idx].count++; rx_ring = &ha->rx_ring[r_idx]; ha->lro_pkt_count++; sdsp = &ha->hw.sds[sds_idx]; pkt_length = sgc->payload_length + sgc->l4_offset; if (sgc->flags & Q8_LRO_COMP_TS) { pkt_length += QLA_TCP_HDR_SIZE + QLA_TCP_TS_OPTION_SIZE; } else { pkt_length += QLA_TCP_HDR_SIZE; } ha->lro_bytes += pkt_length; for (i = 0; i < sgc->num_handles; i++) { rxb = &rx_ring->rx_buf[sgc->handle[i] & 0x7FFF]; QL_ASSERT(ha, (rxb != NULL), ("%s: [sds_idx]=[%d] rxb != NULL\n", __func__,\ sds_idx)); if ((rxb == NULL) || QL_ERR_INJECT(ha, INJCT_LRO_RXB_INVAL)) { /* log the error */ device_printf(ha->pci_dev, "%s invalid rxb[%d, %d, 0x%04x]\n", __func__, sds_idx, i, sgc->handle[i]); qla_rcv_error(ha); return (0); } mp = rxb->m_head; if (i == 0) mpf = mp; QL_ASSERT(ha, (mp != NULL), ("%s: [sds_idx]=[%d] mp != NULL\n", __func__,\ sds_idx)); bus_dmamap_sync(ha->rx_tag, rxb->map, BUS_DMASYNC_POSTREAD); rxb->m_head = NULL; rxb->next = sdsp->rxb_free; sdsp->rxb_free = rxb; sdsp->rx_free++; if ((mp == NULL) || QL_ERR_INJECT(ha, INJCT_LRO_MP_NULL)) { /* log the error */ device_printf(ha->pci_dev, "%s mp == NULL [%d, %d, 0x%04x]\n", __func__, sds_idx, i, sgc->handle[i]); qla_rcv_error(ha); return (0); } if (i == 0) { mpl = mpf = mp; mp->m_flags |= M_PKTHDR; mp->m_pkthdr.len = pkt_length; mp->m_pkthdr.rcvif = ifp; rem_len = mp->m_pkthdr.len; } else { mp->m_flags &= ~M_PKTHDR; mpl->m_next = mp; mpl = mp; rem_len = rem_len - mp->m_len; } } mpl->m_len = rem_len; th = (struct tcphdr *)(mpf->m_data + sgc->l4_offset); if (sgc->flags & Q8_LRO_COMP_PUSH_BIT) th->th_flags |= TH_PUSH; m_adj(mpf, sgc->l2_offset); eh = mtod(mpf, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { uint32_t *data = (uint32_t *)eh; mpf->m_pkthdr.ether_vtag = ntohs(eh->evl_tag); mpf->m_flags |= M_VLANTAG; *(data + 3) = *(data + 2); *(data + 2) = *(data + 1); *(data + 1) = *data; m_adj(mpf, ETHER_VLAN_ENCAP_LEN); etype = ntohs(eh->evl_proto); } else { etype = ntohs(eh->evl_encap_proto); } if (etype == ETHERTYPE_IP) { ip = (struct ip *)(mpf->m_data + ETHER_HDR_LEN); iplen = (ip->ip_hl << 2) + (th->th_off << 2) + sgc->payload_length; ip->ip_len = htons(iplen); ha->ipv4_lro++; } else if (etype == ETHERTYPE_IPV6) { ip6 = (struct ip6_hdr *)(mpf->m_data + ETHER_HDR_LEN); iplen = (th->th_off << 2) + sgc->payload_length; ip6->ip6_plen = htons(iplen); ha->ipv6_lro++; } else { m_freem(mpf); if (sdsp->rx_free > ha->std_replenish) qla_replenish_normal_rx(ha, sdsp, r_idx); return 0; } mpf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mpf->m_pkthdr.csum_data = 0xFFFF; mpf->m_pkthdr.flowid = sgc->rss_hash; - mpf->m_flags |= M_FLOWID; + M_HASHTYPE_SET(mpf, M_HASHTYPE_OPAQUE); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); (*ifp->if_input)(ifp, mpf); if (sdsp->rx_free > ha->std_replenish) qla_replenish_normal_rx(ha, sdsp, r_idx); return (0); } static int qla_rcv_cont_sds(qla_host_t *ha, uint32_t sds_idx, uint32_t comp_idx, uint32_t dcount, uint16_t *handle, uint16_t *nhandles) { uint32_t i; uint16_t num_handles; q80_stat_desc_t *sdesc; uint32_t opcode; *nhandles = 0; dcount--; for (i = 0; i < dcount; i++) { comp_idx = (comp_idx + 1) & (NUM_STATUS_DESCRIPTORS-1); sdesc = (q80_stat_desc_t *) &ha->hw.sds[sds_idx].sds_ring_base[comp_idx]; opcode = Q8_STAT_DESC_OPCODE((sdesc->data[1])); if (!opcode) { device_printf(ha->pci_dev, "%s: opcode=0 %p %p\n", __func__, (void *)sdesc->data[0], (void *)sdesc->data[1]); return -1; } num_handles = Q8_SGL_STAT_DESC_NUM_HANDLES((sdesc->data[1])); if (!num_handles) { device_printf(ha->pci_dev, "%s: opcode=0 %p %p\n", __func__, (void *)sdesc->data[0], (void *)sdesc->data[1]); return -1; } if (QL_ERR_INJECT(ha, INJCT_NUM_HNDLE_INVALID)) num_handles = -1; switch (num_handles) { case 1: *handle++ = Q8_SGL_STAT_DESC_HANDLE1((sdesc->data[0])); break; case 2: *handle++ = Q8_SGL_STAT_DESC_HANDLE1((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE2((sdesc->data[0])); break; case 3: *handle++ = Q8_SGL_STAT_DESC_HANDLE1((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE2((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE3((sdesc->data[0])); break; case 4: *handle++ = Q8_SGL_STAT_DESC_HANDLE1((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE2((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE3((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE4((sdesc->data[0])); break; case 5: *handle++ = Q8_SGL_STAT_DESC_HANDLE1((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE2((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE3((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE4((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE5((sdesc->data[1])); break; case 6: *handle++ = Q8_SGL_STAT_DESC_HANDLE1((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE2((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE3((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE4((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE5((sdesc->data[1])); *handle++ = Q8_SGL_STAT_DESC_HANDLE6((sdesc->data[1])); break; case 7: *handle++ = Q8_SGL_STAT_DESC_HANDLE1((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE2((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE3((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE4((sdesc->data[0])); *handle++ = Q8_SGL_STAT_DESC_HANDLE5((sdesc->data[1])); *handle++ = Q8_SGL_STAT_DESC_HANDLE6((sdesc->data[1])); *handle++ = Q8_SGL_STAT_DESC_HANDLE7((sdesc->data[1])); break; default: device_printf(ha->pci_dev, "%s: invalid num handles %p %p\n", __func__, (void *)sdesc->data[0], (void *)sdesc->data[1]); QL_ASSERT(ha, (0),\ ("%s: %s [nh, sds, d0, d1]=[%d, %d, %p, %p]\n", __func__, "invalid num handles", sds_idx, num_handles, (void *)sdesc->data[0],(void *)sdesc->data[1])); qla_rcv_error(ha); return 0; } *nhandles = *nhandles + num_handles; } return 0; } /* * Name: qla_rcv_isr * Function: Main Interrupt Service Routine */ static uint32_t qla_rcv_isr(qla_host_t *ha, uint32_t sds_idx, uint32_t count) { device_t dev; qla_hw_t *hw; uint32_t comp_idx, c_idx = 0, desc_count = 0, opcode; volatile q80_stat_desc_t *sdesc, *sdesc0 = NULL; uint32_t ret = 0; qla_sgl_comp_t sgc; uint16_t nhandles; uint32_t sds_replenish_threshold = 0; dev = ha->pci_dev; hw = &ha->hw; hw->sds[sds_idx].rcv_active = 1; if (ha->flags.stop_rcv) { hw->sds[sds_idx].rcv_active = 0; return 0; } QL_DPRINT2(ha, (dev, "%s: [%d]enter\n", __func__, sds_idx)); /* * receive interrupts */ comp_idx = hw->sds[sds_idx].sdsr_next; while (count-- && !ha->flags.stop_rcv) { sdesc = (q80_stat_desc_t *) &hw->sds[sds_idx].sds_ring_base[comp_idx]; opcode = Q8_STAT_DESC_OPCODE((sdesc->data[1])); if (!opcode) break; hw->sds[sds_idx].intr_count++; switch (opcode) { case Q8_STAT_DESC_OPCODE_RCV_PKT: desc_count = 1; bzero(&sgc, sizeof(qla_sgl_comp_t)); sgc.rcv.pkt_length = Q8_STAT_DESC_TOTAL_LENGTH((sdesc->data[0])); sgc.rcv.num_handles = 1; sgc.rcv.handle[0] = Q8_STAT_DESC_HANDLE((sdesc->data[0])); sgc.rcv.chksum_status = Q8_STAT_DESC_STATUS((sdesc->data[1])); sgc.rcv.rss_hash = Q8_STAT_DESC_RSS_HASH((sdesc->data[0])); if (Q8_STAT_DESC_VLAN((sdesc->data[1]))) { sgc.rcv.vlan_tag = Q8_STAT_DESC_VLAN_ID((sdesc->data[1])); } qla_rx_intr(ha, &sgc.rcv, sds_idx); break; case Q8_STAT_DESC_OPCODE_SGL_RCV: desc_count = Q8_STAT_DESC_COUNT_SGL_RCV((sdesc->data[1])); if (desc_count > 1) { c_idx = (comp_idx + desc_count -1) & (NUM_STATUS_DESCRIPTORS-1); sdesc0 = (q80_stat_desc_t *) &hw->sds[sds_idx].sds_ring_base[c_idx]; if (Q8_STAT_DESC_OPCODE((sdesc0->data[1])) != Q8_STAT_DESC_OPCODE_CONT) { desc_count = 0; break; } } bzero(&sgc, sizeof(qla_sgl_comp_t)); sgc.rcv.pkt_length = Q8_STAT_DESC_TOTAL_LENGTH_SGL_RCV(\ (sdesc->data[0])); sgc.rcv.chksum_status = Q8_STAT_DESC_STATUS((sdesc->data[1])); sgc.rcv.rss_hash = Q8_STAT_DESC_RSS_HASH((sdesc->data[0])); if (Q8_STAT_DESC_VLAN((sdesc->data[1]))) { sgc.rcv.vlan_tag = Q8_STAT_DESC_VLAN_ID((sdesc->data[1])); } QL_ASSERT(ha, (desc_count <= 2) ,\ ("%s: [sds_idx, data0, data1]="\ "%d, %p, %p]\n", __func__, sds_idx,\ (void *)sdesc->data[0],\ (void *)sdesc->data[1])); sgc.rcv.num_handles = 1; sgc.rcv.handle[0] = Q8_STAT_DESC_HANDLE((sdesc->data[0])); if (qla_rcv_cont_sds(ha, sds_idx, comp_idx, desc_count, &sgc.rcv.handle[1], &nhandles)) { device_printf(dev, "%s: [sds_idx, dcount, data0, data1]=" "[%d, %d, 0x%llx, 0x%llx]\n", __func__, sds_idx, desc_count, (long long unsigned int)sdesc->data[0], (long long unsigned int)sdesc->data[1]); desc_count = 0; break; } sgc.rcv.num_handles += nhandles; qla_rx_intr(ha, &sgc.rcv, sds_idx); break; case Q8_STAT_DESC_OPCODE_SGL_LRO: desc_count = Q8_STAT_DESC_COUNT_SGL_LRO((sdesc->data[1])); if (desc_count > 1) { c_idx = (comp_idx + desc_count -1) & (NUM_STATUS_DESCRIPTORS-1); sdesc0 = (q80_stat_desc_t *) &hw->sds[sds_idx].sds_ring_base[c_idx]; if (Q8_STAT_DESC_OPCODE((sdesc0->data[1])) != Q8_STAT_DESC_OPCODE_CONT) { desc_count = 0; break; } } bzero(&sgc, sizeof(qla_sgl_comp_t)); sgc.lro.payload_length = Q8_STAT_DESC_TOTAL_LENGTH_SGL_RCV((sdesc->data[0])); sgc.lro.rss_hash = Q8_STAT_DESC_RSS_HASH((sdesc->data[0])); sgc.lro.num_handles = 1; sgc.lro.handle[0] = Q8_STAT_DESC_HANDLE((sdesc->data[0])); if (Q8_SGL_LRO_STAT_TS((sdesc->data[1]))) sgc.lro.flags |= Q8_LRO_COMP_TS; if (Q8_SGL_LRO_STAT_PUSH_BIT((sdesc->data[1]))) sgc.lro.flags |= Q8_LRO_COMP_PUSH_BIT; sgc.lro.l2_offset = Q8_SGL_LRO_STAT_L2_OFFSET((sdesc->data[1])); sgc.lro.l4_offset = Q8_SGL_LRO_STAT_L4_OFFSET((sdesc->data[1])); if (Q8_STAT_DESC_VLAN((sdesc->data[1]))) { sgc.lro.vlan_tag = Q8_STAT_DESC_VLAN_ID((sdesc->data[1])); } QL_ASSERT(ha, (desc_count <= 7) ,\ ("%s: [sds_idx, data0, data1]="\ "[%d, 0x%llx, 0x%llx]\n",\ __func__, sds_idx,\ (long long unsigned int)sdesc->data[0],\ (long long unsigned int)sdesc->data[1])); if (qla_rcv_cont_sds(ha, sds_idx, comp_idx, desc_count, &sgc.lro.handle[1], &nhandles)) { device_printf(dev, "%s: [sds_idx, data0, data1]="\ "[%d, 0x%llx, 0x%llx]\n",\ __func__, sds_idx,\ (long long unsigned int)sdesc->data[0],\ (long long unsigned int)sdesc->data[1]); desc_count = 0; break; } sgc.lro.num_handles += nhandles; if (qla_lro_intr(ha, &sgc.lro, sds_idx)) { device_printf(dev, "%s: [sds_idx, data0, data1]="\ "[%d, 0x%llx, 0x%llx]\n",\ __func__, sds_idx,\ (long long unsigned int)sdesc->data[0],\ (long long unsigned int)sdesc->data[1]); device_printf(dev, "%s: [comp_idx, c_idx, dcount, nhndls]="\ "[%d, %d, %d, %d]\n",\ __func__, comp_idx, c_idx, desc_count, sgc.lro.num_handles); if (desc_count > 1) { device_printf(dev, "%s: [sds_idx, data0, data1]="\ "[%d, 0x%llx, 0x%llx]\n",\ __func__, sds_idx,\ (long long unsigned int)sdesc0->data[0],\ (long long unsigned int)sdesc0->data[1]); } } break; default: device_printf(dev, "%s: default 0x%llx!\n", __func__, (long long unsigned int)sdesc->data[0]); break; } if (desc_count == 0) break; sds_replenish_threshold += desc_count; while (desc_count--) { sdesc->data[0] = 0ULL; sdesc->data[1] = 0ULL; comp_idx = (comp_idx + 1) & (NUM_STATUS_DESCRIPTORS-1); sdesc = (q80_stat_desc_t *) &hw->sds[sds_idx].sds_ring_base[comp_idx]; } if (sds_replenish_threshold > ha->hw.sds_cidx_thres) { sds_replenish_threshold = 0; if (hw->sds[sds_idx].sdsr_next != comp_idx) { QL_UPDATE_SDS_CONSUMER_INDEX(ha, sds_idx,\ comp_idx); } hw->sds[sds_idx].sdsr_next = comp_idx; } } if (ha->flags.stop_rcv) goto qla_rcv_isr_exit; if (hw->sds[sds_idx].sdsr_next != comp_idx) { QL_UPDATE_SDS_CONSUMER_INDEX(ha, sds_idx, comp_idx); } hw->sds[sds_idx].sdsr_next = comp_idx; sdesc = (q80_stat_desc_t *)&hw->sds[sds_idx].sds_ring_base[comp_idx]; opcode = Q8_STAT_DESC_OPCODE((sdesc->data[1])); if (opcode) ret = -1; qla_rcv_isr_exit: hw->sds[sds_idx].rcv_active = 0; return (ret); } void ql_mbx_isr(void *arg) { qla_host_t *ha; uint32_t data; uint32_t prev_link_state; ha = arg; if (ha == NULL) { device_printf(ha->pci_dev, "%s: arg == NULL\n", __func__); return; } data = READ_REG32(ha, Q8_FW_MBOX_CNTRL); if ((data & 0x3) != 0x1) { WRITE_REG32(ha, ha->hw.mbx_intr_mask_offset, 0); return; } data = READ_REG32(ha, Q8_FW_MBOX0); if ((data & 0xF000) != 0x8000) return; data = data & 0xFFFF; switch (data) { case 0x8001: /* It's an AEN */ ha->hw.cable_oui = READ_REG32(ha, (Q8_FW_MBOX0 + 4)); data = READ_REG32(ha, (Q8_FW_MBOX0 + 8)); ha->hw.cable_length = data & 0xFFFF; data = data >> 16; ha->hw.link_speed = data & 0xFFF; data = READ_REG32(ha, (Q8_FW_MBOX0 + 12)); prev_link_state = ha->hw.link_up; ha->hw.link_up = (((data & 0xFF) == 0) ? 0 : 1); if (prev_link_state != ha->hw.link_up) { if (ha->hw.link_up) if_link_state_change(ha->ifp, LINK_STATE_UP); else if_link_state_change(ha->ifp, LINK_STATE_DOWN); } ha->hw.module_type = ((data >> 8) & 0xFF); ha->hw.flags.fduplex = (((data & 0xFF0000) == 0) ? 0 : 1); ha->hw.flags.autoneg = (((data & 0xFF000000) == 0) ? 0 : 1); data = READ_REG32(ha, (Q8_FW_MBOX0 + 16)); ha->hw.flags.loopback_mode = data & 0x03; ha->hw.link_faults = (data >> 3) & 0xFF; WRITE_REG32(ha, Q8_FW_MBOX_CNTRL, 0x0); WRITE_REG32(ha, ha->hw.mbx_intr_mask_offset, 0x0); break; default: device_printf(ha->pci_dev, "%s: AEN[0x%08x]\n", __func__, data); WRITE_REG32(ha, Q8_FW_MBOX_CNTRL, 0x0); WRITE_REG32(ha, ha->hw.mbx_intr_mask_offset, 0x0); break; } return; } static void qla_replenish_normal_rx(qla_host_t *ha, qla_sds_t *sdsp, uint32_t r_idx) { qla_rx_buf_t *rxb; int count = sdsp->rx_free; uint32_t rx_next; qla_rdesc_t *rdesc; /* we can play with this value via a sysctl */ uint32_t replenish_thresh = ha->hw.rds_pidx_thres; rdesc = &ha->hw.rds[r_idx]; rx_next = rdesc->rx_next; while (count--) { rxb = sdsp->rxb_free; if (rxb == NULL) break; sdsp->rxb_free = rxb->next; sdsp->rx_free--; if (ql_get_mbuf(ha, rxb, NULL) == 0) { qla_set_hw_rcv_desc(ha, r_idx, rdesc->rx_in, rxb->handle, rxb->paddr, (rxb->m_head)->m_pkthdr.len); rdesc->rx_in++; if (rdesc->rx_in == NUM_RX_DESCRIPTORS) rdesc->rx_in = 0; rdesc->rx_next++; if (rdesc->rx_next == NUM_RX_DESCRIPTORS) rdesc->rx_next = 0; } else { device_printf(ha->pci_dev, "%s: ql_get_mbuf [0,(%d),(%d)] failed\n", __func__, rdesc->rx_in, rxb->handle); rxb->m_head = NULL; rxb->next = sdsp->rxb_free; sdsp->rxb_free = rxb; sdsp->rx_free++; break; } if (replenish_thresh-- == 0) { QL_UPDATE_RDS_PRODUCER_INDEX(ha, rdesc->prod_std, rdesc->rx_next); rx_next = rdesc->rx_next; replenish_thresh = ha->hw.rds_pidx_thres; } } if (rx_next != rdesc->rx_next) { QL_UPDATE_RDS_PRODUCER_INDEX(ha, rdesc->prod_std, rdesc->rx_next); } } void ql_isr(void *arg) { qla_ivec_t *ivec = arg; qla_host_t *ha ; int idx; qla_hw_t *hw; struct ifnet *ifp; uint32_t ret = 0; ha = ivec->ha; hw = &ha->hw; ifp = ha->ifp; if ((idx = ivec->sds_idx) >= ha->hw.num_sds_rings) return; if (idx == 0) taskqueue_enqueue(ha->tx_tq, &ha->tx_task); ret = qla_rcv_isr(ha, idx, -1); if (idx == 0) taskqueue_enqueue(ha->tx_tq, &ha->tx_task); if (!ha->flags.stop_rcv) { QL_ENABLE_INTERRUPTS(ha, idx); } return; } Index: head/sys/dev/qlxgbe/ql_os.c =================================================================== --- head/sys/dev/qlxgbe/ql_os.c (revision 275357) +++ head/sys/dev/qlxgbe/ql_os.c (revision 275358) @@ -1,1700 +1,1701 @@ /* * Copyright (c) 2013-2014 Qlogic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * and ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * File: ql_os.c * Author : David C Somayajulu, Qlogic Corporation, Aliso Viejo, CA 92656. */ #include __FBSDID("$FreeBSD$"); #include "ql_os.h" #include "ql_hw.h" #include "ql_def.h" #include "ql_inline.h" #include "ql_ver.h" #include "ql_glbl.h" #include "ql_dbg.h" #include /* * Some PCI Configuration Space Related Defines */ #ifndef PCI_VENDOR_QLOGIC #define PCI_VENDOR_QLOGIC 0x1077 #endif #ifndef PCI_PRODUCT_QLOGIC_ISP8030 #define PCI_PRODUCT_QLOGIC_ISP8030 0x8030 #endif #define PCI_QLOGIC_ISP8030 \ ((PCI_PRODUCT_QLOGIC_ISP8030 << 16) | PCI_VENDOR_QLOGIC) /* * static functions */ static int qla_alloc_parent_dma_tag(qla_host_t *ha); static void qla_free_parent_dma_tag(qla_host_t *ha); static int qla_alloc_xmt_bufs(qla_host_t *ha); static void qla_free_xmt_bufs(qla_host_t *ha); static int qla_alloc_rcv_bufs(qla_host_t *ha); static void qla_free_rcv_bufs(qla_host_t *ha); static void qla_clear_tx_buf(qla_host_t *ha, qla_tx_buf_t *txb); static void qla_init_ifnet(device_t dev, qla_host_t *ha); static int qla_sysctl_get_stats(SYSCTL_HANDLER_ARGS); static int qla_sysctl_get_link_status(SYSCTL_HANDLER_ARGS); static void qla_release(qla_host_t *ha); static void qla_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error); static void qla_stop(qla_host_t *ha); static int qla_send(qla_host_t *ha, struct mbuf **m_headp); static void qla_tx_done(void *context, int pending); static void qla_get_peer(qla_host_t *ha); static void qla_error_recovery(void *context, int pending); /* * Hooks to the Operating Systems */ static int qla_pci_probe (device_t); static int qla_pci_attach (device_t); static int qla_pci_detach (device_t); static void qla_init(void *arg); static int qla_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int qla_media_change(struct ifnet *ifp); static void qla_media_status(struct ifnet *ifp, struct ifmediareq *ifmr); static void qla_start(struct ifnet *ifp); static device_method_t qla_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, qla_pci_probe), DEVMETHOD(device_attach, qla_pci_attach), DEVMETHOD(device_detach, qla_pci_detach), { 0, 0 } }; static driver_t qla_pci_driver = { "ql", qla_pci_methods, sizeof (qla_host_t), }; static devclass_t qla83xx_devclass; DRIVER_MODULE(qla83xx, pci, qla_pci_driver, qla83xx_devclass, 0, 0); MODULE_DEPEND(qla83xx, pci, 1, 1, 1); MODULE_DEPEND(qla83xx, ether, 1, 1, 1); MALLOC_DEFINE(M_QLA83XXBUF, "qla83xxbuf", "Buffers for qla83xx driver"); #define QL_STD_REPLENISH_THRES 0 #define QL_JUMBO_REPLENISH_THRES 32 static char dev_str[64]; /* * Name: qla_pci_probe * Function: Validate the PCI device to be a QLA80XX device */ static int qla_pci_probe(device_t dev) { switch ((pci_get_device(dev) << 16) | (pci_get_vendor(dev))) { case PCI_QLOGIC_ISP8030: snprintf(dev_str, sizeof(dev_str), "%s v%d.%d.%d", "Qlogic ISP 83xx PCI CNA Adapter-Ethernet Function", QLA_VERSION_MAJOR, QLA_VERSION_MINOR, QLA_VERSION_BUILD); device_set_desc(dev, dev_str); break; default: return (ENXIO); } if (bootverbose) printf("%s: %s\n ", __func__, dev_str); return (BUS_PROBE_DEFAULT); } static void qla_add_sysctls(qla_host_t *ha) { device_t dev = ha->pci_dev; SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "stats", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_get_stats, "I", "Statistics"); SYSCTL_ADD_STRING(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "fw_version", CTLFLAG_RD, ha->fw_ver_str, 0, "firmware version"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "link_status", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qla_sysctl_get_link_status, "I", "Link Status"); ha->dbg_level = 0; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "debug", CTLFLAG_RW, &ha->dbg_level, ha->dbg_level, "Debug Level"); ha->std_replenish = QL_STD_REPLENISH_THRES; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "std_replenish", CTLFLAG_RW, &ha->std_replenish, ha->std_replenish, "Threshold for Replenishing Standard Frames"); SYSCTL_ADD_QUAD(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ipv4_lro", CTLFLAG_RD, &ha->ipv4_lro, "number of ipv4 lro completions"); SYSCTL_ADD_QUAD(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ipv6_lro", CTLFLAG_RD, &ha->ipv6_lro, "number of ipv6 lro completions"); SYSCTL_ADD_QUAD(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "tx_tso_frames", CTLFLAG_RD, &ha->tx_tso_frames, "number of Tx TSO Frames"); SYSCTL_ADD_QUAD(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "hw_vlan_tx_frames", CTLFLAG_RD, &ha->hw_vlan_tx_frames, "number of Tx VLAN Frames"); return; } static void qla_watchdog(void *arg) { qla_host_t *ha = arg; qla_hw_t *hw; struct ifnet *ifp; uint32_t i; qla_hw_tx_cntxt_t *hw_tx_cntxt; hw = &ha->hw; ifp = ha->ifp; if (ha->flags.qla_watchdog_exit) { ha->qla_watchdog_exited = 1; return; } ha->qla_watchdog_exited = 0; if (!ha->flags.qla_watchdog_pause) { if (ql_hw_check_health(ha) || ha->qla_initiate_recovery || (ha->msg_from_peer == QL_PEER_MSG_RESET)) { ha->qla_watchdog_paused = 1; ha->flags.qla_watchdog_pause = 1; ha->qla_initiate_recovery = 0; ha->err_inject = 0; taskqueue_enqueue(ha->err_tq, &ha->err_task); } else { for (i = 0; i < ha->hw.num_tx_rings; i++) { hw_tx_cntxt = &hw->tx_cntxt[i]; if (qla_le32_to_host(*(hw_tx_cntxt->tx_cons)) != hw_tx_cntxt->txr_comp) { taskqueue_enqueue(ha->tx_tq, &ha->tx_task); break; } } if ((ifp->if_snd.ifq_head != NULL) && QL_RUNNING(ifp)) { taskqueue_enqueue(ha->tx_tq, &ha->tx_task); } ha->qla_watchdog_paused = 0; } } else { ha->qla_watchdog_paused = 1; } ha->watchdog_ticks = ha->watchdog_ticks++ % 1000; callout_reset(&ha->tx_callout, QLA_WATCHDOG_CALLOUT_TICKS, qla_watchdog, ha); } /* * Name: qla_pci_attach * Function: attaches the device to the operating system */ static int qla_pci_attach(device_t dev) { qla_host_t *ha = NULL; uint32_t rsrc_len; int i; QL_DPRINT2(ha, (dev, "%s: enter\n", __func__)); if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "cannot get softc\n"); return (ENOMEM); } memset(ha, 0, sizeof (qla_host_t)); if (pci_get_device(dev) != PCI_PRODUCT_QLOGIC_ISP8030) { device_printf(dev, "device is not ISP8030\n"); return (ENXIO); } ha->pci_func = pci_get_function(dev); ha->pci_dev = dev; pci_enable_busmaster(dev); ha->reg_rid = PCIR_BAR(0); ha->pci_reg = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->reg_rid, RF_ACTIVE); if (ha->pci_reg == NULL) { device_printf(dev, "unable to map any ports\n"); goto qla_pci_attach_err; } rsrc_len = (uint32_t) bus_get_resource_count(dev, SYS_RES_MEMORY, ha->reg_rid); mtx_init(&ha->hw_lock, "qla83xx_hw_lock", MTX_NETWORK_LOCK, MTX_DEF); mtx_init(&ha->tx_lock, "qla83xx_tx_lock", MTX_NETWORK_LOCK, MTX_DEF); qla_add_sysctls(ha); ql_hw_add_sysctls(ha); ha->flags.lock_init = 1; ha->reg_rid1 = PCIR_BAR(2); ha->pci_reg1 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->reg_rid1, RF_ACTIVE); ha->msix_count = pci_msix_count(dev); if (ha->msix_count < (ha->hw.num_sds_rings + 1)) { device_printf(dev, "%s: msix_count[%d] not enough\n", __func__, ha->msix_count); goto qla_pci_attach_err; } QL_DPRINT2(ha, (dev, "%s: ha %p pci_func 0x%x rsrc_count 0x%08x" " msix_count 0x%x pci_reg %p\n", __func__, ha, ha->pci_func, rsrc_len, ha->msix_count, ha->pci_reg)); ha->msix_count = ha->hw.num_sds_rings + 1; if (pci_alloc_msix(dev, &ha->msix_count)) { device_printf(dev, "%s: pci_alloc_msi[%d] failed\n", __func__, ha->msix_count); ha->msix_count = 0; goto qla_pci_attach_err; } ha->mbx_irq_rid = 1; ha->mbx_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &ha->mbx_irq_rid, (RF_ACTIVE | RF_SHAREABLE)); if (ha->mbx_irq == NULL) { device_printf(dev, "could not allocate mbx interrupt\n"); goto qla_pci_attach_err; } if (bus_setup_intr(dev, ha->mbx_irq, (INTR_TYPE_NET | INTR_MPSAFE), NULL, ql_mbx_isr, ha, &ha->mbx_handle)) { device_printf(dev, "could not setup mbx interrupt\n"); goto qla_pci_attach_err; } for (i = 0; i < ha->hw.num_sds_rings; i++) { ha->irq_vec[i].sds_idx = i; ha->irq_vec[i].ha = ha; ha->irq_vec[i].irq_rid = 2 + i; ha->irq_vec[i].irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &ha->irq_vec[i].irq_rid, (RF_ACTIVE | RF_SHAREABLE)); if (ha->irq_vec[i].irq == NULL) { device_printf(dev, "could not allocate interrupt\n"); goto qla_pci_attach_err; } if (bus_setup_intr(dev, ha->irq_vec[i].irq, (INTR_TYPE_NET | INTR_MPSAFE), NULL, ql_isr, &ha->irq_vec[i], &ha->irq_vec[i].handle)) { device_printf(dev, "could not setup interrupt\n"); goto qla_pci_attach_err; } } printf("%s: mp__ncpus %d sds %d rds %d msi-x %d\n", __func__, mp_ncpus, ha->hw.num_sds_rings, ha->hw.num_rds_rings, ha->msix_count); /* initialize hardware */ if (ql_init_hw(ha)) { device_printf(dev, "%s: ql_init_hw failed\n", __func__); goto qla_pci_attach_err; } device_printf(dev, "%s: firmware[%d.%d.%d.%d]\n", __func__, ha->fw_ver_major, ha->fw_ver_minor, ha->fw_ver_sub, ha->fw_ver_build); snprintf(ha->fw_ver_str, sizeof(ha->fw_ver_str), "%d.%d.%d.%d", ha->fw_ver_major, ha->fw_ver_minor, ha->fw_ver_sub, ha->fw_ver_build); ql_read_mac_addr(ha); /* allocate parent dma tag */ if (qla_alloc_parent_dma_tag(ha)) { device_printf(dev, "%s: qla_alloc_parent_dma_tag failed\n", __func__); goto qla_pci_attach_err; } /* alloc all dma buffers */ if (ql_alloc_dma(ha)) { device_printf(dev, "%s: ql_alloc_dma failed\n", __func__); goto qla_pci_attach_err; } qla_get_peer(ha); /* create the o.s ethernet interface */ qla_init_ifnet(dev, ha); ha->flags.qla_watchdog_active = 1; ha->flags.qla_watchdog_pause = 1; TASK_INIT(&ha->tx_task, 0, qla_tx_done, ha); ha->tx_tq = taskqueue_create_fast("qla_txq", M_NOWAIT, taskqueue_thread_enqueue, &ha->tx_tq); taskqueue_start_threads(&ha->tx_tq, 1, PI_NET, "%s txq", device_get_nameunit(ha->pci_dev)); callout_init(&ha->tx_callout, TRUE); ha->flags.qla_callout_init = 1; /* create ioctl device interface */ if (ql_make_cdev(ha)) { device_printf(dev, "%s: ql_make_cdev failed\n", __func__); goto qla_pci_attach_err; } callout_reset(&ha->tx_callout, QLA_WATCHDOG_CALLOUT_TICKS, qla_watchdog, ha); TASK_INIT(&ha->err_task, 0, qla_error_recovery, ha); ha->err_tq = taskqueue_create_fast("qla_errq", M_NOWAIT, taskqueue_thread_enqueue, &ha->err_tq); taskqueue_start_threads(&ha->err_tq, 1, PI_NET, "%s errq", device_get_nameunit(ha->pci_dev)); QL_DPRINT2(ha, (dev, "%s: exit 0\n", __func__)); return (0); qla_pci_attach_err: qla_release(ha); QL_DPRINT2(ha, (dev, "%s: exit ENXIO\n", __func__)); return (ENXIO); } /* * Name: qla_pci_detach * Function: Unhooks the device from the operating system */ static int qla_pci_detach(device_t dev) { qla_host_t *ha = NULL; struct ifnet *ifp; QL_DPRINT2(ha, (dev, "%s: enter\n", __func__)); if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "cannot get softc\n"); return (ENOMEM); } ifp = ha->ifp; (void)QLA_LOCK(ha, __func__, 0); qla_stop(ha); QLA_UNLOCK(ha, __func__); qla_release(ha); QL_DPRINT2(ha, (dev, "%s: exit\n", __func__)); return (0); } /* * SYSCTL Related Callbacks */ static int qla_sysctl_get_stats(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qla_host_t *)arg1; ql_get_stats(ha); } return (err); } static int qla_sysctl_get_link_status(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qla_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qla_host_t *)arg1; ql_hw_link_status(ha); } return (err); } /* * Name: qla_release * Function: Releases the resources allocated for the device */ static void qla_release(qla_host_t *ha) { device_t dev; int i; dev = ha->pci_dev; if (ha->err_tq) { taskqueue_drain(ha->err_tq, &ha->err_task); taskqueue_free(ha->err_tq); } if (ha->tx_tq) { taskqueue_drain(ha->tx_tq, &ha->tx_task); taskqueue_free(ha->tx_tq); } ql_del_cdev(ha); if (ha->flags.qla_watchdog_active) { ha->flags.qla_watchdog_exit = 1; while (ha->qla_watchdog_exited == 0) qla_mdelay(__func__, 1); } if (ha->flags.qla_callout_init) callout_stop(&ha->tx_callout); if (ha->ifp != NULL) ether_ifdetach(ha->ifp); ql_free_dma(ha); qla_free_parent_dma_tag(ha); if (ha->mbx_handle) (void)bus_teardown_intr(dev, ha->mbx_irq, ha->mbx_handle); if (ha->mbx_irq) (void) bus_release_resource(dev, SYS_RES_IRQ, ha->mbx_irq_rid, ha->mbx_irq); for (i = 0; i < ha->hw.num_sds_rings; i++) { if (ha->irq_vec[i].handle) { (void)bus_teardown_intr(dev, ha->irq_vec[i].irq, ha->irq_vec[i].handle); } if (ha->irq_vec[i].irq) { (void)bus_release_resource(dev, SYS_RES_IRQ, ha->irq_vec[i].irq_rid, ha->irq_vec[i].irq); } } if (ha->msix_count) pci_release_msi(dev); if (ha->flags.lock_init) { mtx_destroy(&ha->tx_lock); mtx_destroy(&ha->hw_lock); } if (ha->pci_reg) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->reg_rid, ha->pci_reg); if (ha->pci_reg1) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->reg_rid1, ha->pci_reg1); } /* * DMA Related Functions */ static void qla_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { *((bus_addr_t *)arg) = 0; if (error) { printf("%s: bus_dmamap_load failed (%d)\n", __func__, error); return; } *((bus_addr_t *)arg) = segs[0].ds_addr; return; } int ql_alloc_dmabuf(qla_host_t *ha, qla_dma_t *dma_buf) { int ret = 0; device_t dev; bus_addr_t b_addr; dev = ha->pci_dev; QL_DPRINT2(ha, (dev, "%s: enter\n", __func__)); ret = bus_dma_tag_create( ha->parent_tag,/* parent */ dma_buf->alignment, ((bus_size_t)(1ULL << 32)),/* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ dma_buf->size, /* maxsize */ 1, /* nsegments */ dma_buf->size, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &dma_buf->dma_tag); if (ret) { device_printf(dev, "%s: could not create dma tag\n", __func__); goto ql_alloc_dmabuf_exit; } ret = bus_dmamem_alloc(dma_buf->dma_tag, (void **)&dma_buf->dma_b, (BUS_DMA_ZERO | BUS_DMA_COHERENT | BUS_DMA_NOWAIT), &dma_buf->dma_map); if (ret) { bus_dma_tag_destroy(dma_buf->dma_tag); device_printf(dev, "%s: bus_dmamem_alloc failed\n", __func__); goto ql_alloc_dmabuf_exit; } ret = bus_dmamap_load(dma_buf->dma_tag, dma_buf->dma_map, dma_buf->dma_b, dma_buf->size, qla_dmamap_callback, &b_addr, BUS_DMA_NOWAIT); if (ret || !b_addr) { bus_dma_tag_destroy(dma_buf->dma_tag); bus_dmamem_free(dma_buf->dma_tag, dma_buf->dma_b, dma_buf->dma_map); ret = -1; goto ql_alloc_dmabuf_exit; } dma_buf->dma_addr = b_addr; ql_alloc_dmabuf_exit: QL_DPRINT2(ha, (dev, "%s: exit ret 0x%08x tag %p map %p b %p sz 0x%x\n", __func__, ret, (void *)dma_buf->dma_tag, (void *)dma_buf->dma_map, (void *)dma_buf->dma_b, dma_buf->size)); return ret; } void ql_free_dmabuf(qla_host_t *ha, qla_dma_t *dma_buf) { bus_dmamap_unload(dma_buf->dma_tag, dma_buf->dma_map); bus_dmamem_free(dma_buf->dma_tag, dma_buf->dma_b, dma_buf->dma_map); bus_dma_tag_destroy(dma_buf->dma_tag); } static int qla_alloc_parent_dma_tag(qla_host_t *ha) { int ret; device_t dev; dev = ha->pci_dev; /* * Allocate parent DMA Tag */ ret = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ 1,((bus_size_t)(1ULL << 32)),/* alignment, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ 0, /* nsegments */ BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &ha->parent_tag); if (ret) { device_printf(dev, "%s: could not create parent dma tag\n", __func__); return (-1); } ha->flags.parent_tag = 1; return (0); } static void qla_free_parent_dma_tag(qla_host_t *ha) { if (ha->flags.parent_tag) { bus_dma_tag_destroy(ha->parent_tag); ha->flags.parent_tag = 0; } } /* * Name: qla_init_ifnet * Function: Creates the Network Device Interface and Registers it with the O.S */ static void qla_init_ifnet(device_t dev, qla_host_t *ha) { struct ifnet *ifp; QL_DPRINT2(ha, (dev, "%s: enter\n", __func__)); ifp = ha->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) panic("%s: cannot if_alloc()\n", device_get_nameunit(dev)); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); ifp->if_capabilities = IFCAP_LINKSTATE; ifp->if_init = qla_init; ifp->if_softc = ha; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = qla_ioctl; ifp->if_start = qla_start; IFQ_SET_MAXLEN(&ifp->if_snd, qla_get_ifq_snd_maxlen(ha)); ifp->if_snd.ifq_drv_maxlen = qla_get_ifq_snd_maxlen(ha); IFQ_SET_READY(&ifp->if_snd); ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; ether_ifattach(ifp, qla_get_mac_addr(ha)); ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_TSO4 | IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; ifp->if_capabilities |= IFCAP_VLAN_HWTSO; ifp->if_capenable = ifp->if_capabilities; ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifmedia_init(&ha->media, IFM_IMASK, qla_media_change, qla_media_status); ifmedia_add(&ha->media, (IFM_ETHER | qla_get_optics(ha) | IFM_FDX), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | IFM_AUTO), 0, NULL); ifmedia_set(&ha->media, (IFM_ETHER | IFM_AUTO)); QL_DPRINT2(ha, (dev, "%s: exit\n", __func__)); return; } static void qla_init_locked(qla_host_t *ha) { struct ifnet *ifp = ha->ifp; qla_stop(ha); if (qla_alloc_xmt_bufs(ha) != 0) return; if (qla_alloc_rcv_bufs(ha) != 0) return; bcopy(IF_LLADDR(ha->ifp), ha->hw.mac_addr, ETHER_ADDR_LEN); ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO; ha->flags.stop_rcv = 0; if (ql_init_hw_if(ha) == 0) { ifp = ha->ifp; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; ha->flags.qla_watchdog_pause = 0; ha->hw_vlan_tx_frames = 0; ha->tx_tso_frames = 0; } return; } static void qla_init(void *arg) { qla_host_t *ha; ha = (qla_host_t *)arg; QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); (void)QLA_LOCK(ha, __func__, 0); qla_init_locked(ha); QLA_UNLOCK(ha, __func__); QL_DPRINT2(ha, (ha->pci_dev, "%s: exit\n", __func__)); } static int qla_set_multi(qla_host_t *ha, uint32_t add_multi) { uint8_t mta[Q8_MAX_NUM_MULTICAST_ADDRS * Q8_MAC_ADDR_LEN]; struct ifmultiaddr *ifma; int mcnt = 0; struct ifnet *ifp = ha->ifp; int ret = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == Q8_MAX_NUM_MULTICAST_ADDRS) break; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), &mta[mcnt * Q8_MAC_ADDR_LEN], Q8_MAC_ADDR_LEN); mcnt++; } if_maddr_runlock(ifp); if (QLA_LOCK(ha, __func__, 1) == 0) { ret = ql_hw_set_multi(ha, mta, mcnt, add_multi); QLA_UNLOCK(ha, __func__); } return (ret); } static int qla_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { int ret = 0; struct ifreq *ifr = (struct ifreq *)data; struct ifaddr *ifa = (struct ifaddr *)data; qla_host_t *ha; ha = (qla_host_t *)ifp->if_softc; switch (cmd) { case SIOCSIFADDR: QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFADDR (0x%lx)\n", __func__, cmd)); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { (void)QLA_LOCK(ha, __func__, 0); qla_init_locked(ha); QLA_UNLOCK(ha, __func__); } QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFADDR (0x%lx) ipv4 [0x%08x]\n", __func__, cmd, ntohl(IA_SIN(ifa)->sin_addr.s_addr))); arp_ifinit(ifp, ifa); } else { ether_ioctl(ifp, cmd, data); } break; case SIOCSIFMTU: QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFMTU (0x%lx)\n", __func__, cmd)); if (ifr->ifr_mtu > QLA_MAX_MTU) { ret = EINVAL; } else { (void) QLA_LOCK(ha, __func__, 0); ifp->if_mtu = ifr->ifr_mtu; ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { ret = ql_set_max_mtu(ha, ha->max_frame_size, ha->hw.rcv_cntxt_id); } if (ifp->if_mtu > ETHERMTU) ha->std_replenish = QL_JUMBO_REPLENISH_THRES; else ha->std_replenish = QL_STD_REPLENISH_THRES; QLA_UNLOCK(ha, __func__); if (ret) ret = EINVAL; } break; case SIOCSIFFLAGS: QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFFLAGS (0x%lx)\n", __func__, cmd)); (void)QLA_LOCK(ha, __func__, 0); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ ha->if_flags) & IFF_PROMISC) { ret = ql_set_promisc(ha); } else if ((ifp->if_flags ^ ha->if_flags) & IFF_ALLMULTI) { ret = ql_set_allmulti(ha); } } else { qla_init_locked(ha); ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; ret = ql_set_max_mtu(ha, ha->max_frame_size, ha->hw.rcv_cntxt_id); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) qla_stop(ha); ha->if_flags = ifp->if_flags; } QLA_UNLOCK(ha, __func__); break; case SIOCADDMULTI: QL_DPRINT4(ha, (ha->pci_dev, "%s: %s (0x%lx)\n", __func__, "SIOCADDMULTI", cmd)); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (qla_set_multi(ha, 1)) ret = EINVAL; } break; case SIOCDELMULTI: QL_DPRINT4(ha, (ha->pci_dev, "%s: %s (0x%lx)\n", __func__, "SIOCDELMULTI", cmd)); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (qla_set_multi(ha, 0)) ret = EINVAL; } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFMEDIA/SIOCGIFMEDIA (0x%lx)\n", __func__, cmd)); ret = ifmedia_ioctl(ifp, ifr, &ha->media, cmd); break; case SIOCSIFCAP: { int mask = ifr->ifr_reqcap ^ ifp->if_capenable; QL_DPRINT4(ha, (ha->pci_dev, "%s: SIOCSIFCAP (0x%lx)\n", __func__, cmd)); if (mask & IFCAP_HWCSUM) ifp->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) qla_init(ha); VLAN_CAPABILITIES(ifp); break; } default: QL_DPRINT4(ha, (ha->pci_dev, "%s: default (0x%lx)\n", __func__, cmd)); ret = ether_ioctl(ifp, cmd, data); break; } return (ret); } static int qla_media_change(struct ifnet *ifp) { qla_host_t *ha; struct ifmedia *ifm; int ret = 0; ha = (qla_host_t *)ifp->if_softc; QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); ifm = &ha->media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) ret = EINVAL; QL_DPRINT2(ha, (ha->pci_dev, "%s: exit\n", __func__)); return (ret); } static void qla_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { qla_host_t *ha; ha = (qla_host_t *)ifp->if_softc; QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; ql_update_link_state(ha); if (ha->hw.link_up) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= (IFM_FDX | qla_get_optics(ha)); } QL_DPRINT2(ha, (ha->pci_dev, "%s: exit (%s)\n", __func__,\ (ha->hw.link_up ? "link_up" : "link_down"))); return; } static void qla_start(struct ifnet *ifp) { struct mbuf *m_head; qla_host_t *ha = (qla_host_t *)ifp->if_softc; QL_DPRINT8(ha, (ha->pci_dev, "%s: enter\n", __func__)); if (!mtx_trylock(&ha->tx_lock)) { QL_DPRINT8(ha, (ha->pci_dev, "%s: mtx_trylock(&ha->tx_lock) failed\n", __func__)); return; } if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) { QL_DPRINT8(ha, (ha->pci_dev, "%s: !IFF_DRV_RUNNING\n", __func__)); QLA_TX_UNLOCK(ha); return; } if (!ha->watchdog_ticks) ql_update_link_state(ha); if (!ha->hw.link_up) { QL_DPRINT8(ha, (ha->pci_dev, "%s: link down\n", __func__)); QLA_TX_UNLOCK(ha); return; } while (ifp->if_snd.ifq_head != NULL) { IF_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) { QL_DPRINT8(ha, (ha->pci_dev, "%s: m_head == NULL\n", __func__)); break; } if (qla_send(ha, &m_head)) { if (m_head == NULL) break; QL_DPRINT8(ha, (ha->pci_dev, "%s: PREPEND\n", __func__)); ifp->if_drv_flags |= IFF_DRV_OACTIVE; IF_PREPEND(&ifp->if_snd, m_head); break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); } QLA_TX_UNLOCK(ha); QL_DPRINT8(ha, (ha->pci_dev, "%s: exit\n", __func__)); return; } static int qla_send(qla_host_t *ha, struct mbuf **m_headp) { bus_dma_segment_t segs[QLA_MAX_SEGMENTS]; bus_dmamap_t map; int nsegs; int ret = -1; uint32_t tx_idx; struct mbuf *m_head = *m_headp; uint32_t txr_idx = ha->txr_idx; QL_DPRINT8(ha, (ha->pci_dev, "%s: enter\n", __func__)); - if (m_head->m_flags & M_FLOWID) + /* check if flowid is set */ + if (M_HASHTYPE_GET(m_head) != M_HASHTYPE_NONE) txr_idx = m_head->m_pkthdr.flowid & (ha->hw.num_tx_rings - 1); tx_idx = ha->hw.tx_cntxt[txr_idx].txr_next; map = ha->tx_ring[txr_idx].tx_buf[tx_idx].map; ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs, BUS_DMA_NOWAIT); if (ret == EFBIG) { struct mbuf *m; QL_DPRINT8(ha, (ha->pci_dev, "%s: EFBIG [%d]\n", __func__, m_head->m_pkthdr.len)); m = m_defrag(m_head, M_NOWAIT); if (m == NULL) { ha->err_tx_defrag++; m_freem(m_head); *m_headp = NULL; device_printf(ha->pci_dev, "%s: m_defrag() = NULL [%d]\n", __func__, ret); return (ENOBUFS); } m_head = m; *m_headp = m_head; if ((ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs, BUS_DMA_NOWAIT))) { ha->err_tx_dmamap_load++; device_printf(ha->pci_dev, "%s: bus_dmamap_load_mbuf_sg failed0[%d, %d]\n", __func__, ret, m_head->m_pkthdr.len); if (ret != ENOMEM) { m_freem(m_head); *m_headp = NULL; } return (ret); } } else if (ret) { ha->err_tx_dmamap_load++; device_printf(ha->pci_dev, "%s: bus_dmamap_load_mbuf_sg failed1[%d, %d]\n", __func__, ret, m_head->m_pkthdr.len); if (ret != ENOMEM) { m_freem(m_head); *m_headp = NULL; } return (ret); } QL_ASSERT(ha, (nsegs != 0), ("qla_send: empty packet")); bus_dmamap_sync(ha->tx_tag, map, BUS_DMASYNC_PREWRITE); if (!(ret = ql_hw_send(ha, segs, nsegs, tx_idx, m_head, txr_idx))) { ha->tx_ring[txr_idx].count++; ha->tx_ring[txr_idx].tx_buf[tx_idx].m_head = m_head; } else { if (ret == EINVAL) { if (m_head) m_freem(m_head); *m_headp = NULL; } } QL_DPRINT8(ha, (ha->pci_dev, "%s: exit\n", __func__)); return (ret); } static void qla_stop(qla_host_t *ha) { struct ifnet *ifp = ha->ifp; device_t dev; dev = ha->pci_dev; ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE | IFF_DRV_RUNNING); ha->flags.qla_watchdog_pause = 1; while (!ha->qla_watchdog_paused) qla_mdelay(__func__, 1); ha->flags.stop_rcv = 1; ql_hw_stop_rcv(ha); ql_del_hw_if(ha); qla_free_xmt_bufs(ha); qla_free_rcv_bufs(ha); return; } /* * Buffer Management Functions for Transmit and Receive Rings */ static int qla_alloc_xmt_bufs(qla_host_t *ha) { int ret = 0; uint32_t i, j; qla_tx_buf_t *txb; if (bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ QLA_MAX_TSO_FRAME_SIZE, /* maxsize */ QLA_MAX_SEGMENTS, /* nsegments */ PAGE_SIZE, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &ha->tx_tag)) { device_printf(ha->pci_dev, "%s: tx_tag alloc failed\n", __func__); return (ENOMEM); } for (i = 0; i < ha->hw.num_tx_rings; i++) { bzero((void *)ha->tx_ring[i].tx_buf, (sizeof(qla_tx_buf_t) * NUM_TX_DESCRIPTORS)); } for (j = 0; j < ha->hw.num_tx_rings; j++) { for (i = 0; i < NUM_TX_DESCRIPTORS; i++) { txb = &ha->tx_ring[j].tx_buf[i]; if ((ret = bus_dmamap_create(ha->tx_tag, BUS_DMA_NOWAIT, &txb->map))) { ha->err_tx_dmamap_create++; device_printf(ha->pci_dev, "%s: bus_dmamap_create failed[%d]\n", __func__, ret); qla_free_xmt_bufs(ha); return (ret); } } } return 0; } /* * Release mbuf after it sent on the wire */ static void qla_clear_tx_buf(qla_host_t *ha, qla_tx_buf_t *txb) { QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); if (txb->m_head && txb->map) { bus_dmamap_unload(ha->tx_tag, txb->map); m_freem(txb->m_head); txb->m_head = NULL; } if (txb->map) bus_dmamap_destroy(ha->tx_tag, txb->map); QL_DPRINT2(ha, (ha->pci_dev, "%s: exit\n", __func__)); } static void qla_free_xmt_bufs(qla_host_t *ha) { int i, j; for (j = 0; j < ha->hw.num_tx_rings; j++) { for (i = 0; i < NUM_TX_DESCRIPTORS; i++) qla_clear_tx_buf(ha, &ha->tx_ring[j].tx_buf[i]); } if (ha->tx_tag != NULL) { bus_dma_tag_destroy(ha->tx_tag); ha->tx_tag = NULL; } for (i = 0; i < ha->hw.num_tx_rings; i++) { bzero((void *)ha->tx_ring[i].tx_buf, (sizeof(qla_tx_buf_t) * NUM_TX_DESCRIPTORS)); } return; } static int qla_alloc_rcv_std(qla_host_t *ha) { int i, j, k, r, ret = 0; qla_rx_buf_t *rxb; qla_rx_ring_t *rx_ring; for (r = 0; r < ha->hw.num_rds_rings; r++) { rx_ring = &ha->rx_ring[r]; for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { rxb = &rx_ring->rx_buf[i]; ret = bus_dmamap_create(ha->rx_tag, BUS_DMA_NOWAIT, &rxb->map); if (ret) { device_printf(ha->pci_dev, "%s: dmamap[%d, %d] failed\n", __func__, r, i); for (k = 0; k < r; k++) { for (j = 0; j < NUM_RX_DESCRIPTORS; j++) { rxb = &ha->rx_ring[k].rx_buf[j]; bus_dmamap_destroy(ha->rx_tag, rxb->map); } } for (j = 0; j < i; j++) { bus_dmamap_destroy(ha->rx_tag, rx_ring->rx_buf[j].map); } goto qla_alloc_rcv_std_err; } } } qla_init_hw_rcv_descriptors(ha); for (r = 0; r < ha->hw.num_rds_rings; r++) { rx_ring = &ha->rx_ring[r]; for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { rxb = &rx_ring->rx_buf[i]; rxb->handle = i; if (!(ret = ql_get_mbuf(ha, rxb, NULL))) { /* * set the physical address in the * corresponding descriptor entry in the * receive ring/queue for the hba */ qla_set_hw_rcv_desc(ha, r, i, rxb->handle, rxb->paddr, (rxb->m_head)->m_pkthdr.len); } else { device_printf(ha->pci_dev, "%s: ql_get_mbuf [%d, %d] failed\n", __func__, r, i); bus_dmamap_destroy(ha->rx_tag, rxb->map); goto qla_alloc_rcv_std_err; } } } return 0; qla_alloc_rcv_std_err: return (-1); } static void qla_free_rcv_std(qla_host_t *ha) { int i, r; qla_rx_buf_t *rxb; for (r = 0; r < ha->hw.num_rds_rings; r++) { for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { rxb = &ha->rx_ring[r].rx_buf[i]; if (rxb->m_head != NULL) { bus_dmamap_unload(ha->rx_tag, rxb->map); bus_dmamap_destroy(ha->rx_tag, rxb->map); m_freem(rxb->m_head); rxb->m_head = NULL; } } } return; } static int qla_alloc_rcv_bufs(qla_host_t *ha) { int i, ret = 0; if (bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM9BYTES, /* maxsize */ 1, /* nsegments */ MJUM9BYTES, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &ha->rx_tag)) { device_printf(ha->pci_dev, "%s: rx_tag alloc failed\n", __func__); return (ENOMEM); } bzero((void *)ha->rx_ring, (sizeof(qla_rx_ring_t) * MAX_RDS_RINGS)); for (i = 0; i < ha->hw.num_sds_rings; i++) { ha->hw.sds[i].sdsr_next = 0; ha->hw.sds[i].rxb_free = NULL; ha->hw.sds[i].rx_free = 0; } ret = qla_alloc_rcv_std(ha); return (ret); } static void qla_free_rcv_bufs(qla_host_t *ha) { int i; qla_free_rcv_std(ha); if (ha->rx_tag != NULL) { bus_dma_tag_destroy(ha->rx_tag); ha->rx_tag = NULL; } bzero((void *)ha->rx_ring, (sizeof(qla_rx_ring_t) * MAX_RDS_RINGS)); for (i = 0; i < ha->hw.num_sds_rings; i++) { ha->hw.sds[i].sdsr_next = 0; ha->hw.sds[i].rxb_free = NULL; ha->hw.sds[i].rx_free = 0; } return; } int ql_get_mbuf(qla_host_t *ha, qla_rx_buf_t *rxb, struct mbuf *nmp) { register struct mbuf *mp = nmp; struct ifnet *ifp; int ret = 0; uint32_t offset; bus_dma_segment_t segs[1]; int nsegs; QL_DPRINT2(ha, (ha->pci_dev, "%s: enter\n", __func__)); ifp = ha->ifp; if (mp == NULL) { mp = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (mp == NULL) { ha->err_m_getcl++; ret = ENOBUFS; device_printf(ha->pci_dev, "%s: m_getcl failed\n", __func__); goto exit_ql_get_mbuf; } mp->m_len = mp->m_pkthdr.len = MCLBYTES; } else { mp->m_len = mp->m_pkthdr.len = MCLBYTES; mp->m_data = mp->m_ext.ext_buf; mp->m_next = NULL; } offset = (uint32_t)((unsigned long long)mp->m_data & 0x7ULL); if (offset) { offset = 8 - offset; m_adj(mp, offset); } /* * Using memory from the mbuf cluster pool, invoke the bus_dma * machinery to arrange the memory mapping. */ ret = bus_dmamap_load_mbuf_sg(ha->rx_tag, rxb->map, mp, segs, &nsegs, BUS_DMA_NOWAIT); rxb->paddr = segs[0].ds_addr; if (ret || !rxb->paddr || (nsegs != 1)) { m_free(mp); rxb->m_head = NULL; device_printf(ha->pci_dev, "%s: bus_dmamap_load failed[%d, 0x%016llx, %d]\n", __func__, ret, (long long unsigned int)rxb->paddr, nsegs); ret = -1; goto exit_ql_get_mbuf; } rxb->m_head = mp; bus_dmamap_sync(ha->rx_tag, rxb->map, BUS_DMASYNC_PREREAD); exit_ql_get_mbuf: QL_DPRINT2(ha, (ha->pci_dev, "%s: exit ret = 0x%08x\n", __func__, ret)); return (ret); } static void qla_tx_done(void *context, int pending) { qla_host_t *ha = context; struct ifnet *ifp; ifp = ha->ifp; if (!ifp) return; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { QL_DPRINT8(ha, (ha->pci_dev, "%s: !IFF_DRV_RUNNING\n", __func__)); return; } ql_hw_tx_done(ha); qla_start(ha->ifp); } static void qla_get_peer(qla_host_t *ha) { device_t *peers; int count, i, slot; int my_slot = pci_get_slot(ha->pci_dev); if (device_get_children(device_get_parent(ha->pci_dev), &peers, &count)) return; for (i = 0; i < count; i++) { slot = pci_get_slot(peers[i]); if ((slot >= 0) && (slot == my_slot) && (pci_get_device(peers[i]) == pci_get_device(ha->pci_dev))) { if (ha->pci_dev != peers[i]) ha->peer_dev = peers[i]; } } } static void qla_send_msg_to_peer(qla_host_t *ha, uint32_t msg_to_peer) { qla_host_t *ha_peer; if (ha->peer_dev) { if ((ha_peer = device_get_softc(ha->peer_dev)) != NULL) { ha_peer->msg_from_peer = msg_to_peer; } } } static void qla_error_recovery(void *context, int pending) { qla_host_t *ha = context; uint32_t msecs_100 = 100; struct ifnet *ifp = ha->ifp; (void)QLA_LOCK(ha, __func__, 0); ha->flags.stop_rcv = 1; ql_hw_stop_rcv(ha); ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE | IFF_DRV_RUNNING); QLA_UNLOCK(ha, __func__); if ((ha->pci_func & 0x1) == 0) { if (!ha->msg_from_peer) { qla_send_msg_to_peer(ha, QL_PEER_MSG_RESET); while ((ha->msg_from_peer != QL_PEER_MSG_ACK) && msecs_100--) qla_mdelay(__func__, 100); } ha->msg_from_peer = 0; ql_minidump(ha); (void) ql_init_hw(ha); qla_free_xmt_bufs(ha); qla_free_rcv_bufs(ha); qla_send_msg_to_peer(ha, QL_PEER_MSG_ACK); } else { if (ha->msg_from_peer == QL_PEER_MSG_RESET) { ha->msg_from_peer = 0; qla_send_msg_to_peer(ha, QL_PEER_MSG_ACK); } else { qla_send_msg_to_peer(ha, QL_PEER_MSG_RESET); } while ((ha->msg_from_peer != QL_PEER_MSG_ACK) && msecs_100--) qla_mdelay(__func__, 100); ha->msg_from_peer = 0; (void) ql_init_hw(ha); qla_free_xmt_bufs(ha); qla_free_rcv_bufs(ha); } (void)QLA_LOCK(ha, __func__, 0); if (qla_alloc_xmt_bufs(ha) != 0) { QLA_UNLOCK(ha, __func__); return; } if (qla_alloc_rcv_bufs(ha) != 0) { QLA_UNLOCK(ha, __func__); return; } ha->flags.stop_rcv = 0; if (ql_init_hw_if(ha) == 0) { ifp = ha->ifp; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; ha->flags.qla_watchdog_pause = 0; } QLA_UNLOCK(ha, __func__); } Index: head/sys/dev/qlxge/qls_isr.c =================================================================== --- head/sys/dev/qlxge/qls_isr.c (revision 275357) +++ head/sys/dev/qlxge/qls_isr.c (revision 275358) @@ -1,401 +1,401 @@ /* * Copyright (c) 2013-2014 Qlogic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * File: qls_isr.c * Author : David C Somayajulu, Qlogic Corporation, Aliso Viejo, CA 92656. */ #include __FBSDID("$FreeBSD$"); #include "qls_os.h" #include "qls_hw.h" #include "qls_def.h" #include "qls_inline.h" #include "qls_ver.h" #include "qls_glbl.h" #include "qls_dbg.h" static void qls_tx_comp(qla_host_t *ha, uint32_t txr_idx, q81_tx_mac_comp_t *tx_comp) { qla_tx_buf_t *txb; uint32_t tx_idx = tx_comp->tid_lo; if (tx_idx >= NUM_TX_DESCRIPTORS) { ha->qla_initiate_recovery = 1; return; } txb = &ha->tx_ring[txr_idx].tx_buf[tx_idx]; if (txb->m_head) { if_inc_counter(ha->ifp, IFCOUNTER_OPACKETS, 1); bus_dmamap_sync(ha->tx_tag, txb->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(ha->tx_tag, txb->map); m_freem(txb->m_head); txb->m_head = NULL; } ha->tx_ring[txr_idx].txr_done++; if (ha->tx_ring[txr_idx].txr_done == NUM_TX_DESCRIPTORS) ha->tx_ring[txr_idx].txr_done = 0; } static void qls_replenish_rx(qla_host_t *ha, uint32_t r_idx) { qla_rx_buf_t *rxb; qla_rx_ring_t *rxr; int count; volatile q81_bq_addr_e_t *sbq_e; rxr = &ha->rx_ring[r_idx]; count = rxr->rx_free; sbq_e = rxr->sbq_vaddr; while (count--) { rxb = &rxr->rx_buf[rxr->sbq_next]; if (rxb->m_head == NULL) { if (qls_get_mbuf(ha, rxb, NULL) != 0) { device_printf(ha->pci_dev, "%s: qls_get_mbuf [0,%d,%d] failed\n", __func__, rxr->sbq_next, r_idx); rxb->m_head = NULL; break; } } if (rxb->m_head != NULL) { sbq_e[rxr->sbq_next].addr_lo = (uint32_t)rxb->paddr; sbq_e[rxr->sbq_next].addr_hi = (uint32_t)(rxb->paddr >> 32); rxr->sbq_next++; if (rxr->sbq_next == NUM_RX_DESCRIPTORS) rxr->sbq_next = 0; rxr->sbq_free++; rxr->rx_free--; } if (rxr->sbq_free == 16) { rxr->sbq_in += 16; rxr->sbq_in = rxr->sbq_in & (NUM_RX_DESCRIPTORS - 1); rxr->sbq_free = 0; Q81_WR_SBQ_PROD_IDX(r_idx, (rxr->sbq_in)); } } } static int qls_rx_comp(qla_host_t *ha, uint32_t rxr_idx, uint32_t cq_idx, q81_rx_t *cq_e) { qla_rx_buf_t *rxb; qla_rx_ring_t *rxr; device_t dev = ha->pci_dev; struct mbuf *mp = NULL; struct ifnet *ifp = ha->ifp; struct lro_ctrl *lro; struct ether_vlan_header *eh; rxr = &ha->rx_ring[rxr_idx]; lro = &rxr->lro; rxb = &rxr->rx_buf[rxr->rx_next]; if (!(cq_e->flags1 & Q81_RX_FLAGS1_DS)) { device_printf(dev, "%s: DS bit not set \n", __func__); return -1; } if (rxb->paddr != cq_e->b_paddr) { device_printf(dev, "%s: (rxb->paddr != cq_e->b_paddr)[%p, %p] \n", __func__, (void *)rxb->paddr, (void *)cq_e->b_paddr); Q81_SET_CQ_INVALID(cq_idx); ha->qla_initiate_recovery = 1; return(-1); } rxr->rx_int++; if ((cq_e->flags1 & Q81_RX_FLAGS1_ERR_MASK) == 0) { mp = rxb->m_head; rxb->m_head = NULL; if (mp == NULL) { device_printf(dev, "%s: mp == NULL\n", __func__); } else { mp->m_flags |= M_PKTHDR; mp->m_pkthdr.len = cq_e->length; mp->m_pkthdr.rcvif = ifp; mp->m_len = cq_e->length; eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { uint32_t *data = (uint32_t *)eh; mp->m_pkthdr.ether_vtag = ntohs(eh->evl_tag); mp->m_flags |= M_VLANTAG; *(data + 3) = *(data + 2); *(data + 2) = *(data + 1); *(data + 1) = *data; m_adj(mp, ETHER_VLAN_ENCAP_LEN); } if ((cq_e->flags1 & Q81_RX_FLAGS1_RSS_MATCH_MASK)) { rxr->rss_int++; mp->m_pkthdr.flowid = cq_e->rss; - mp->m_flags |= M_FLOWID; + M_HASHTYPE_SET(mp, M_HASHTYPE_OPAQUE); } if (cq_e->flags0 & (Q81_RX_FLAGS0_TE | Q81_RX_FLAGS0_NU | Q81_RX_FLAGS0_IE)) { mp->m_pkthdr.csum_flags = 0; } else { mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mp->m_pkthdr.csum_data = 0xFFFF; } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if (lro->lro_cnt && (tcp_lro_rx(lro, mp, 0) == 0)) { /* LRO packet has been successfuly queued */ } else { (*ifp->if_input)(ifp, mp); } } } else { device_printf(dev, "%s: err [0%08x]\n", __func__, cq_e->flags1); } rxr->rx_free++; rxr->rx_next++; if (rxr->rx_next == NUM_RX_DESCRIPTORS) rxr->rx_next = 0; if ((rxr->rx_free + rxr->sbq_free) >= 16) qls_replenish_rx(ha, rxr_idx); return 0; } static void qls_cq_isr(qla_host_t *ha, uint32_t cq_idx) { q81_cq_e_t *cq_e, *cq_b; uint32_t i, cq_comp_idx; int ret = 0, tx_comp_done = 0; struct lro_ctrl *lro; struct lro_entry *queued; cq_b = ha->rx_ring[cq_idx].cq_base_vaddr; lro = &ha->rx_ring[cq_idx].lro; cq_comp_idx = *(ha->rx_ring[cq_idx].cqi_vaddr); i = ha->rx_ring[cq_idx].cq_next; while (i != cq_comp_idx) { cq_e = &cq_b[i]; switch (cq_e->opcode) { case Q81_IOCB_TX_MAC: case Q81_IOCB_TX_TSO: qls_tx_comp(ha, cq_idx, (q81_tx_mac_comp_t *)cq_e); tx_comp_done++; break; case Q81_IOCB_RX: ret = qls_rx_comp(ha, cq_idx, i, (q81_rx_t *)cq_e); break; case Q81_IOCB_MPI: case Q81_IOCB_SYS: default: device_printf(ha->pci_dev, "%s[%d %d 0x%x]: illegal \n", __func__, i, (*(ha->rx_ring[cq_idx].cqi_vaddr)), cq_e->opcode); qls_dump_buf32(ha, __func__, cq_e, (sizeof (q81_cq_e_t) >> 2)); break; } i++; if (i == NUM_CQ_ENTRIES) i = 0; if (ret) { break; } if (i == cq_comp_idx) { cq_comp_idx = *(ha->rx_ring[cq_idx].cqi_vaddr); } if (tx_comp_done) { taskqueue_enqueue(ha->tx_tq, &ha->tx_task); tx_comp_done = 0; } } while((!SLIST_EMPTY(&lro->lro_active))) { queued = SLIST_FIRST(&lro->lro_active); SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } ha->rx_ring[cq_idx].cq_next = cq_comp_idx; if (!ret) { Q81_WR_CQ_CONS_IDX(cq_idx, (ha->rx_ring[cq_idx].cq_next)); } if (tx_comp_done) taskqueue_enqueue(ha->tx_tq, &ha->tx_task); return; } static void qls_mbx_isr(qla_host_t *ha) { uint32_t data; int i; device_t dev = ha->pci_dev; if (qls_mbx_rd_reg(ha, 0, &data) == 0) { if ((data & 0xF000) == 0x4000) { ha->mbox[0] = data; for (i = 1; i < Q81_NUM_MBX_REGISTERS; i++) { if (qls_mbx_rd_reg(ha, i, &data)) break; ha->mbox[i] = data; } ha->mbx_done = 1; } else if ((data & 0xF000) == 0x8000) { /* we have an AEN */ ha->aen[0] = data; for (i = 1; i < Q81_NUM_AEN_REGISTERS; i++) { if (qls_mbx_rd_reg(ha, i, &data)) break; ha->aen[i] = data; } device_printf(dev,"%s: AEN " "[0x%08x 0x%08x 0x%08x 0x%08x 0x%08x" " 0x%08x 0x%08x 0x%08x 0x%08x]\n", __func__, ha->aen[0], ha->aen[1], ha->aen[2], ha->aen[3], ha->aen[4], ha->aen[5], ha->aen[6], ha->aen[7], ha->aen[8]); switch ((ha->aen[0] & 0xFFFF)) { case 0x8011: ha->link_up = 1; break; case 0x8012: ha->link_up = 0; break; case 0x8130: ha->link_hw_info = ha->aen[1]; break; case 0x8131: ha->link_hw_info = 0; break; } } } WRITE_REG32(ha, Q81_CTL_HOST_CMD_STATUS, Q81_CTL_HCS_CMD_CLR_RTH_INTR); return; } void qls_isr(void *arg) { qla_ivec_t *ivec = arg; qla_host_t *ha; uint32_t status; uint32_t cq_idx; device_t dev; ha = ivec->ha; cq_idx = ivec->cq_idx; dev = ha->pci_dev; status = READ_REG32(ha, Q81_CTL_STATUS); if (status & Q81_CTL_STATUS_FE) { device_printf(dev, "%s fatal error\n", __func__); return; } if ((cq_idx == 0) && (status & Q81_CTL_STATUS_PI)) { qls_mbx_isr(ha); } status = READ_REG32(ha, Q81_CTL_INTR_STATUS1); if (status & ( 0x1 << cq_idx)) qls_cq_isr(ha, cq_idx); Q81_ENABLE_INTR(ha, cq_idx); return; } Index: head/sys/dev/qlxge/qls_os.c =================================================================== --- head/sys/dev/qlxge/qls_os.c (revision 275357) +++ head/sys/dev/qlxge/qls_os.c (revision 275358) @@ -1,1531 +1,1532 @@ /* * Copyright (c) 2013-2014 Qlogic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * and ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * File: qls_os.c * Author : David C Somayajulu, Qlogic Corporation, Aliso Viejo, CA 92656. */ #include __FBSDID("$FreeBSD$"); #include "qls_os.h" #include "qls_hw.h" #include "qls_def.h" #include "qls_inline.h" #include "qls_ver.h" #include "qls_glbl.h" #include "qls_dbg.h" #include /* * Some PCI Configuration Space Related Defines */ #ifndef PCI_VENDOR_QLOGIC #define PCI_VENDOR_QLOGIC 0x1077 #endif #ifndef PCI_DEVICE_QLOGIC_8000 #define PCI_DEVICE_QLOGIC_8000 0x8000 #endif #define PCI_QLOGIC_DEV8000 \ ((PCI_DEVICE_QLOGIC_8000 << 16) | PCI_VENDOR_QLOGIC) /* * static functions */ static int qls_alloc_parent_dma_tag(qla_host_t *ha); static void qls_free_parent_dma_tag(qla_host_t *ha); static void qls_flush_xmt_bufs(qla_host_t *ha); static int qls_alloc_rcv_bufs(qla_host_t *ha); static void qls_free_rcv_bufs(qla_host_t *ha); static void qls_init_ifnet(device_t dev, qla_host_t *ha); static void qls_release(qla_host_t *ha); static void qls_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error); static void qls_stop(qla_host_t *ha); static int qls_send(qla_host_t *ha, struct mbuf **m_headp); static void qls_tx_done(void *context, int pending); static int qls_config_lro(qla_host_t *ha); static void qls_free_lro(qla_host_t *ha); static void qls_error_recovery(void *context, int pending); /* * Hooks to the Operating Systems */ static int qls_pci_probe (device_t); static int qls_pci_attach (device_t); static int qls_pci_detach (device_t); static void qls_start(struct ifnet *ifp); static void qls_init(void *arg); static int qls_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int qls_media_change(struct ifnet *ifp); static void qls_media_status(struct ifnet *ifp, struct ifmediareq *ifmr); static device_method_t qla_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, qls_pci_probe), DEVMETHOD(device_attach, qls_pci_attach), DEVMETHOD(device_detach, qls_pci_detach), { 0, 0 } }; static driver_t qla_pci_driver = { "ql", qla_pci_methods, sizeof (qla_host_t), }; static devclass_t qla8000_devclass; DRIVER_MODULE(qla8000, pci, qla_pci_driver, qla8000_devclass, 0, 0); MODULE_DEPEND(qla8000, pci, 1, 1, 1); MODULE_DEPEND(qla8000, ether, 1, 1, 1); MALLOC_DEFINE(M_QLA8000BUF, "qla8000buf", "Buffers for qla8000 driver"); static char dev_str[64]; static char ver_str[64]; /* * Name: qls_pci_probe * Function: Validate the PCI device to be a QLA80XX device */ static int qls_pci_probe(device_t dev) { switch ((pci_get_device(dev) << 16) | (pci_get_vendor(dev))) { case PCI_QLOGIC_DEV8000: snprintf(dev_str, sizeof(dev_str), "%s v%d.%d.%d", "Qlogic ISP 8000 PCI CNA Adapter-Ethernet Function", QLA_VERSION_MAJOR, QLA_VERSION_MINOR, QLA_VERSION_BUILD); snprintf(ver_str, sizeof(ver_str), "v%d.%d.%d", QLA_VERSION_MAJOR, QLA_VERSION_MINOR, QLA_VERSION_BUILD); device_set_desc(dev, dev_str); break; default: return (ENXIO); } if (bootverbose) printf("%s: %s\n ", __func__, dev_str); return (BUS_PROBE_DEFAULT); } static int qls_sysctl_get_drvr_stats(SYSCTL_HANDLER_ARGS) { int err = 0, ret; qla_host_t *ha; uint32_t i; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qla_host_t *)arg1; for (i = 0; i < ha->num_tx_rings; i++) { device_printf(ha->pci_dev, "%s: tx_ring[%d].tx_frames= %p\n", __func__, i, (void *)ha->tx_ring[i].tx_frames); device_printf(ha->pci_dev, "%s: tx_ring[%d].tx_tso_frames= %p\n", __func__, i, (void *)ha->tx_ring[i].tx_tso_frames); device_printf(ha->pci_dev, "%s: tx_ring[%d].tx_vlan_frames= %p\n", __func__, i, (void *)ha->tx_ring[i].tx_vlan_frames); device_printf(ha->pci_dev, "%s: tx_ring[%d].txr_free= 0x%08x\n", __func__, i, ha->tx_ring[i].txr_free); device_printf(ha->pci_dev, "%s: tx_ring[%d].txr_next= 0x%08x\n", __func__, i, ha->tx_ring[i].txr_next); device_printf(ha->pci_dev, "%s: tx_ring[%d].txr_done= 0x%08x\n", __func__, i, ha->tx_ring[i].txr_done); device_printf(ha->pci_dev, "%s: tx_ring[%d].txr_cons_idx= 0x%08x\n", __func__, i, *(ha->tx_ring[i].txr_cons_vaddr)); } for (i = 0; i < ha->num_rx_rings; i++) { device_printf(ha->pci_dev, "%s: rx_ring[%d].rx_int= %p\n", __func__, i, (void *)ha->rx_ring[i].rx_int); device_printf(ha->pci_dev, "%s: rx_ring[%d].rss_int= %p\n", __func__, i, (void *)ha->rx_ring[i].rss_int); device_printf(ha->pci_dev, "%s: rx_ring[%d].lbq_next= 0x%08x\n", __func__, i, ha->rx_ring[i].lbq_next); device_printf(ha->pci_dev, "%s: rx_ring[%d].lbq_free= 0x%08x\n", __func__, i, ha->rx_ring[i].lbq_free); device_printf(ha->pci_dev, "%s: rx_ring[%d].lbq_in= 0x%08x\n", __func__, i, ha->rx_ring[i].lbq_in); device_printf(ha->pci_dev, "%s: rx_ring[%d].sbq_next= 0x%08x\n", __func__, i, ha->rx_ring[i].sbq_next); device_printf(ha->pci_dev, "%s: rx_ring[%d].sbq_free= 0x%08x\n", __func__, i, ha->rx_ring[i].sbq_free); device_printf(ha->pci_dev, "%s: rx_ring[%d].sbq_in= 0x%08x\n", __func__, i, ha->rx_ring[i].sbq_in); } device_printf(ha->pci_dev, "%s: err_m_getcl = 0x%08x\n", __func__, ha->err_m_getcl); device_printf(ha->pci_dev, "%s: err_m_getjcl = 0x%08x\n", __func__, ha->err_m_getjcl); device_printf(ha->pci_dev, "%s: err_tx_dmamap_create = 0x%08x\n", __func__, ha->err_tx_dmamap_create); device_printf(ha->pci_dev, "%s: err_tx_dmamap_load = 0x%08x\n", __func__, ha->err_tx_dmamap_load); device_printf(ha->pci_dev, "%s: err_tx_defrag = 0x%08x\n", __func__, ha->err_tx_defrag); } return (err); } static void qls_add_sysctls(qla_host_t *ha) { device_t dev = ha->pci_dev; SYSCTL_ADD_STRING(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "version", CTLFLAG_RD, ver_str, 0, "Driver Version"); qls_dbg_level = 0; SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "debug", CTLFLAG_RW, &qls_dbg_level, qls_dbg_level, "Debug Level"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "drvr_stats", CTLTYPE_INT | CTLFLAG_RW, (void *)ha, 0, qls_sysctl_get_drvr_stats, "I", "Driver Maintained Statistics"); return; } static void qls_watchdog(void *arg) { qla_host_t *ha = arg; struct ifnet *ifp; ifp = ha->ifp; if (ha->flags.qla_watchdog_exit) { ha->qla_watchdog_exited = 1; return; } ha->qla_watchdog_exited = 0; if (!ha->flags.qla_watchdog_pause) { if (ha->qla_initiate_recovery) { ha->qla_watchdog_paused = 1; ha->qla_initiate_recovery = 0; ha->err_inject = 0; taskqueue_enqueue(ha->err_tq, &ha->err_task); } else if ((ifp->if_snd.ifq_head != NULL) && QL_RUNNING(ifp)) { taskqueue_enqueue(ha->tx_tq, &ha->tx_task); } ha->qla_watchdog_paused = 0; } else { ha->qla_watchdog_paused = 1; } ha->watchdog_ticks = ha->watchdog_ticks++ % 1000; callout_reset(&ha->tx_callout, QLA_WATCHDOG_CALLOUT_TICKS, qls_watchdog, ha); return; } /* * Name: qls_pci_attach * Function: attaches the device to the operating system */ static int qls_pci_attach(device_t dev) { qla_host_t *ha = NULL; int i; QL_DPRINT2((dev, "%s: enter\n", __func__)); if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "cannot get softc\n"); return (ENOMEM); } memset(ha, 0, sizeof (qla_host_t)); if (pci_get_device(dev) != PCI_DEVICE_QLOGIC_8000) { device_printf(dev, "device is not QLE8000\n"); return (ENXIO); } ha->pci_func = pci_get_function(dev); ha->pci_dev = dev; pci_enable_busmaster(dev); ha->reg_rid = PCIR_BAR(1); ha->pci_reg = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->reg_rid, RF_ACTIVE); if (ha->pci_reg == NULL) { device_printf(dev, "unable to map any ports\n"); goto qls_pci_attach_err; } ha->reg_rid1 = PCIR_BAR(3); ha->pci_reg1 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->reg_rid1, RF_ACTIVE); if (ha->pci_reg1 == NULL) { device_printf(dev, "unable to map any ports\n"); goto qls_pci_attach_err; } mtx_init(&ha->hw_lock, "qla80xx_hw_lock", MTX_NETWORK_LOCK, MTX_DEF); mtx_init(&ha->tx_lock, "qla80xx_tx_lock", MTX_NETWORK_LOCK, MTX_DEF); qls_add_sysctls(ha); qls_hw_add_sysctls(ha); ha->flags.lock_init = 1; ha->msix_count = pci_msix_count(dev); if (ha->msix_count < qls_get_msix_count(ha)) { device_printf(dev, "%s: msix_count[%d] not enough\n", __func__, ha->msix_count); goto qls_pci_attach_err; } ha->msix_count = qls_get_msix_count(ha); device_printf(dev, "\n%s: ha %p pci_func 0x%x msix_count 0x%x" " pci_reg %p pci_reg1 %p\n", __func__, ha, ha->pci_func, ha->msix_count, ha->pci_reg, ha->pci_reg1); if (pci_alloc_msix(dev, &ha->msix_count)) { device_printf(dev, "%s: pci_alloc_msi[%d] failed\n", __func__, ha->msix_count); ha->msix_count = 0; goto qls_pci_attach_err; } for (i = 0; i < ha->num_rx_rings; i++) { ha->irq_vec[i].cq_idx = i; ha->irq_vec[i].ha = ha; ha->irq_vec[i].irq_rid = 1 + i; ha->irq_vec[i].irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &ha->irq_vec[i].irq_rid, (RF_ACTIVE | RF_SHAREABLE)); if (ha->irq_vec[i].irq == NULL) { device_printf(dev, "could not allocate interrupt\n"); goto qls_pci_attach_err; } if (bus_setup_intr(dev, ha->irq_vec[i].irq, (INTR_TYPE_NET | INTR_MPSAFE), NULL, qls_isr, &ha->irq_vec[i], &ha->irq_vec[i].handle)) { device_printf(dev, "could not setup interrupt\n"); goto qls_pci_attach_err; } } qls_rd_nic_params(ha); /* allocate parent dma tag */ if (qls_alloc_parent_dma_tag(ha)) { device_printf(dev, "%s: qls_alloc_parent_dma_tag failed\n", __func__); goto qls_pci_attach_err; } /* alloc all dma buffers */ if (qls_alloc_dma(ha)) { device_printf(dev, "%s: qls_alloc_dma failed\n", __func__); goto qls_pci_attach_err; } /* create the o.s ethernet interface */ qls_init_ifnet(dev, ha); ha->flags.qla_watchdog_active = 1; ha->flags.qla_watchdog_pause = 1; TASK_INIT(&ha->tx_task, 0, qls_tx_done, ha); ha->tx_tq = taskqueue_create_fast("qla_txq", M_NOWAIT, taskqueue_thread_enqueue, &ha->tx_tq); taskqueue_start_threads(&ha->tx_tq, 1, PI_NET, "%s txq", device_get_nameunit(ha->pci_dev)); callout_init(&ha->tx_callout, TRUE); ha->flags.qla_callout_init = 1; /* create ioctl device interface */ if (qls_make_cdev(ha)) { device_printf(dev, "%s: qls_make_cdev failed\n", __func__); goto qls_pci_attach_err; } callout_reset(&ha->tx_callout, QLA_WATCHDOG_CALLOUT_TICKS, qls_watchdog, ha); TASK_INIT(&ha->err_task, 0, qls_error_recovery, ha); ha->err_tq = taskqueue_create_fast("qla_errq", M_NOWAIT, taskqueue_thread_enqueue, &ha->err_tq); taskqueue_start_threads(&ha->err_tq, 1, PI_NET, "%s errq", device_get_nameunit(ha->pci_dev)); QL_DPRINT2((dev, "%s: exit 0\n", __func__)); return (0); qls_pci_attach_err: qls_release(ha); QL_DPRINT2((dev, "%s: exit ENXIO\n", __func__)); return (ENXIO); } /* * Name: qls_pci_detach * Function: Unhooks the device from the operating system */ static int qls_pci_detach(device_t dev) { qla_host_t *ha = NULL; struct ifnet *ifp; QL_DPRINT2((dev, "%s: enter\n", __func__)); if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "cannot get softc\n"); return (ENOMEM); } ifp = ha->ifp; (void)QLA_LOCK(ha, __func__, 0); qls_stop(ha); QLA_UNLOCK(ha, __func__); qls_release(ha); QL_DPRINT2((dev, "%s: exit\n", __func__)); return (0); } /* * Name: qls_release * Function: Releases the resources allocated for the device */ static void qls_release(qla_host_t *ha) { device_t dev; int i; dev = ha->pci_dev; if (ha->err_tq) { taskqueue_drain(ha->err_tq, &ha->err_task); taskqueue_free(ha->err_tq); } if (ha->tx_tq) { taskqueue_drain(ha->tx_tq, &ha->tx_task); taskqueue_free(ha->tx_tq); } qls_del_cdev(ha); if (ha->flags.qla_watchdog_active) { ha->flags.qla_watchdog_exit = 1; while (ha->qla_watchdog_exited == 0) qls_mdelay(__func__, 1); } if (ha->flags.qla_callout_init) callout_stop(&ha->tx_callout); if (ha->ifp != NULL) ether_ifdetach(ha->ifp); qls_free_dma(ha); qls_free_parent_dma_tag(ha); for (i = 0; i < ha->num_rx_rings; i++) { if (ha->irq_vec[i].handle) { (void)bus_teardown_intr(dev, ha->irq_vec[i].irq, ha->irq_vec[i].handle); } if (ha->irq_vec[i].irq) { (void)bus_release_resource(dev, SYS_RES_IRQ, ha->irq_vec[i].irq_rid, ha->irq_vec[i].irq); } } if (ha->msix_count) pci_release_msi(dev); if (ha->flags.lock_init) { mtx_destroy(&ha->tx_lock); mtx_destroy(&ha->hw_lock); } if (ha->pci_reg) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->reg_rid, ha->pci_reg); if (ha->pci_reg1) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->reg_rid1, ha->pci_reg1); } /* * DMA Related Functions */ static void qls_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { *((bus_addr_t *)arg) = 0; if (error) { printf("%s: bus_dmamap_load failed (%d)\n", __func__, error); return; } *((bus_addr_t *)arg) = segs[0].ds_addr; return; } int qls_alloc_dmabuf(qla_host_t *ha, qla_dma_t *dma_buf) { int ret = 0; device_t dev; bus_addr_t b_addr; dev = ha->pci_dev; QL_DPRINT2((dev, "%s: enter\n", __func__)); ret = bus_dma_tag_create( ha->parent_tag,/* parent */ dma_buf->alignment, ((bus_size_t)(1ULL << 32)),/* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ dma_buf->size, /* maxsize */ 1, /* nsegments */ dma_buf->size, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &dma_buf->dma_tag); if (ret) { device_printf(dev, "%s: could not create dma tag\n", __func__); goto qls_alloc_dmabuf_exit; } ret = bus_dmamem_alloc(dma_buf->dma_tag, (void **)&dma_buf->dma_b, (BUS_DMA_ZERO | BUS_DMA_COHERENT | BUS_DMA_NOWAIT), &dma_buf->dma_map); if (ret) { bus_dma_tag_destroy(dma_buf->dma_tag); device_printf(dev, "%s: bus_dmamem_alloc failed\n", __func__); goto qls_alloc_dmabuf_exit; } ret = bus_dmamap_load(dma_buf->dma_tag, dma_buf->dma_map, dma_buf->dma_b, dma_buf->size, qls_dmamap_callback, &b_addr, BUS_DMA_NOWAIT); if (ret || !b_addr) { bus_dma_tag_destroy(dma_buf->dma_tag); bus_dmamem_free(dma_buf->dma_tag, dma_buf->dma_b, dma_buf->dma_map); ret = -1; goto qls_alloc_dmabuf_exit; } dma_buf->dma_addr = b_addr; qls_alloc_dmabuf_exit: QL_DPRINT2((dev, "%s: exit ret 0x%08x tag %p map %p b %p sz 0x%x\n", __func__, ret, (void *)dma_buf->dma_tag, (void *)dma_buf->dma_map, (void *)dma_buf->dma_b, dma_buf->size)); return ret; } void qls_free_dmabuf(qla_host_t *ha, qla_dma_t *dma_buf) { bus_dmamap_unload(dma_buf->dma_tag, dma_buf->dma_map); bus_dmamem_free(dma_buf->dma_tag, dma_buf->dma_b, dma_buf->dma_map); bus_dma_tag_destroy(dma_buf->dma_tag); } static int qls_alloc_parent_dma_tag(qla_host_t *ha) { int ret; device_t dev; dev = ha->pci_dev; /* * Allocate parent DMA Tag */ ret = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ 1,((bus_size_t)(1ULL << 32)),/* alignment, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ 0, /* nsegments */ BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &ha->parent_tag); if (ret) { device_printf(dev, "%s: could not create parent dma tag\n", __func__); return (-1); } ha->flags.parent_tag = 1; return (0); } static void qls_free_parent_dma_tag(qla_host_t *ha) { if (ha->flags.parent_tag) { bus_dma_tag_destroy(ha->parent_tag); ha->flags.parent_tag = 0; } } /* * Name: qls_init_ifnet * Function: Creates the Network Device Interface and Registers it with the O.S */ static void qls_init_ifnet(device_t dev, qla_host_t *ha) { struct ifnet *ifp; QL_DPRINT2((dev, "%s: enter\n", __func__)); ifp = ha->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) panic("%s: cannot if_alloc()\n", device_get_nameunit(dev)); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); ifp->if_init = qls_init; ifp->if_softc = ha; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = qls_ioctl; ifp->if_start = qls_start; IFQ_SET_MAXLEN(&ifp->if_snd, qls_get_ifq_snd_maxlen(ha)); ifp->if_snd.ifq_drv_maxlen = qls_get_ifq_snd_maxlen(ha); IFQ_SET_READY(&ifp->if_snd); ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; if (ha->max_frame_size <= MCLBYTES) { ha->msize = MCLBYTES; } else if (ha->max_frame_size <= MJUMPAGESIZE) { ha->msize = MJUMPAGESIZE; } else ha->msize = MJUM9BYTES; ether_ifattach(ifp, qls_get_mac_addr(ha)); ifp->if_capabilities = IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_HWCSUM; ifp->if_capabilities |= IFCAP_VLAN_MTU; ifp->if_capabilities |= IFCAP_TSO4; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; ifp->if_capabilities |= IFCAP_VLAN_HWTSO; ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable = ifp->if_capabilities; ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifmedia_init(&ha->media, IFM_IMASK, qls_media_change, qls_media_status); ifmedia_add(&ha->media, (IFM_ETHER | qls_get_optics(ha) | IFM_FDX), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | IFM_AUTO), 0, NULL); ifmedia_set(&ha->media, (IFM_ETHER | IFM_AUTO)); QL_DPRINT2((dev, "%s: exit\n", __func__)); return; } static void qls_init_locked(qla_host_t *ha) { struct ifnet *ifp = ha->ifp; qls_stop(ha); qls_flush_xmt_bufs(ha); if (qls_alloc_rcv_bufs(ha) != 0) return; if (qls_config_lro(ha)) return; bcopy(IF_LLADDR(ha->ifp), ha->mac_addr, ETHER_ADDR_LEN); ifp->if_hwassist = CSUM_IP; ifp->if_hwassist |= CSUM_TCP; ifp->if_hwassist |= CSUM_UDP; ifp->if_hwassist |= CSUM_TSO; if (qls_init_hw_if(ha) == 0) { ifp = ha->ifp; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; ha->flags.qla_watchdog_pause = 0; } return; } static void qls_init(void *arg) { qla_host_t *ha; ha = (qla_host_t *)arg; QL_DPRINT2((ha->pci_dev, "%s: enter\n", __func__)); (void)QLA_LOCK(ha, __func__, 0); qls_init_locked(ha); QLA_UNLOCK(ha, __func__); QL_DPRINT2((ha->pci_dev, "%s: exit\n", __func__)); } static void qls_set_multi(qla_host_t *ha, uint32_t add_multi) { uint8_t mta[Q8_MAX_NUM_MULTICAST_ADDRS * Q8_MAC_ADDR_LEN]; struct ifmultiaddr *ifma; int mcnt = 0; struct ifnet *ifp = ha->ifp; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == Q8_MAX_NUM_MULTICAST_ADDRS) break; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), &mta[mcnt * Q8_MAC_ADDR_LEN], Q8_MAC_ADDR_LEN); mcnt++; } if_maddr_runlock(ifp); if (QLA_LOCK(ha, __func__, 1) == 0) { qls_hw_set_multi(ha, mta, mcnt, add_multi); QLA_UNLOCK(ha, __func__); } return; } static int qls_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { int ret = 0; struct ifreq *ifr = (struct ifreq *)data; struct ifaddr *ifa = (struct ifaddr *)data; qla_host_t *ha; ha = (qla_host_t *)ifp->if_softc; switch (cmd) { case SIOCSIFADDR: QL_DPRINT4((ha->pci_dev, "%s: SIOCSIFADDR (0x%lx)\n", __func__, cmd)); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { (void)QLA_LOCK(ha, __func__, 0); qls_init_locked(ha); QLA_UNLOCK(ha, __func__); } QL_DPRINT4((ha->pci_dev, "%s: SIOCSIFADDR (0x%lx) ipv4 [0x%08x]\n", __func__, cmd, ntohl(IA_SIN(ifa)->sin_addr.s_addr))); arp_ifinit(ifp, ifa); } else { ether_ioctl(ifp, cmd, data); } break; case SIOCSIFMTU: QL_DPRINT4((ha->pci_dev, "%s: SIOCSIFMTU (0x%lx)\n", __func__, cmd)); if (ifr->ifr_mtu > QLA_MAX_MTU) { ret = EINVAL; } else { (void) QLA_LOCK(ha, __func__, 0); ifp->if_mtu = ifr->ifr_mtu; ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; QLA_UNLOCK(ha, __func__); if (ret) ret = EINVAL; } break; case SIOCSIFFLAGS: QL_DPRINT4((ha->pci_dev, "%s: SIOCSIFFLAGS (0x%lx)\n", __func__, cmd)); (void)QLA_LOCK(ha, __func__, 0); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ ha->if_flags) & IFF_PROMISC) { ret = qls_set_promisc(ha); } else if ((ifp->if_flags ^ ha->if_flags) & IFF_ALLMULTI) { ret = qls_set_allmulti(ha); } } else { ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; qls_init_locked(ha); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) qls_stop(ha); ha->if_flags = ifp->if_flags; } QLA_UNLOCK(ha, __func__); break; case SIOCADDMULTI: QL_DPRINT4((ha->pci_dev, "%s: %s (0x%lx)\n", __func__, "SIOCADDMULTI", cmd)); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { qls_set_multi(ha, 1); } break; case SIOCDELMULTI: QL_DPRINT4((ha->pci_dev, "%s: %s (0x%lx)\n", __func__, "SIOCDELMULTI", cmd)); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { qls_set_multi(ha, 0); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: QL_DPRINT4((ha->pci_dev, "%s: SIOCSIFMEDIA/SIOCGIFMEDIA (0x%lx)\n", __func__, cmd)); ret = ifmedia_ioctl(ifp, ifr, &ha->media, cmd); break; case SIOCSIFCAP: { int mask = ifr->ifr_reqcap ^ ifp->if_capenable; QL_DPRINT4((ha->pci_dev, "%s: SIOCSIFCAP (0x%lx)\n", __func__, cmd)); if (mask & IFCAP_HWCSUM) ifp->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) qls_init(ha); VLAN_CAPABILITIES(ifp); break; } default: QL_DPRINT4((ha->pci_dev, "%s: default (0x%lx)\n", __func__, cmd)); ret = ether_ioctl(ifp, cmd, data); break; } return (ret); } static int qls_media_change(struct ifnet *ifp) { qla_host_t *ha; struct ifmedia *ifm; int ret = 0; ha = (qla_host_t *)ifp->if_softc; QL_DPRINT2((ha->pci_dev, "%s: enter\n", __func__)); ifm = &ha->media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) ret = EINVAL; QL_DPRINT2((ha->pci_dev, "%s: exit\n", __func__)); return (ret); } static void qls_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { qla_host_t *ha; ha = (qla_host_t *)ifp->if_softc; QL_DPRINT2((ha->pci_dev, "%s: enter\n", __func__)); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; qls_update_link_state(ha); if (ha->link_up) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= (IFM_FDX | qls_get_optics(ha)); } QL_DPRINT2((ha->pci_dev, "%s: exit (%s)\n", __func__,\ (ha->link_up ? "link_up" : "link_down"))); return; } static void qls_start(struct ifnet *ifp) { int i, ret = 0; struct mbuf *m_head; qla_host_t *ha = (qla_host_t *)ifp->if_softc; QL_DPRINT8((ha->pci_dev, "%s: enter\n", __func__)); if (!mtx_trylock(&ha->tx_lock)) { QL_DPRINT8((ha->pci_dev, "%s: mtx_trylock(&ha->tx_lock) failed\n", __func__)); return; } if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == IFF_DRV_RUNNING) { for (i = 0; i < ha->num_tx_rings; i++) { ret |= qls_hw_tx_done(ha, i); } if (ret == 0) ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) { QL_DPRINT8((ha->pci_dev, "%s: !IFF_DRV_RUNNING\n", __func__)); QLA_TX_UNLOCK(ha); return; } if (!ha->link_up) { qls_update_link_state(ha); if (!ha->link_up) { QL_DPRINT8((ha->pci_dev, "%s: link down\n", __func__)); QLA_TX_UNLOCK(ha); return; } } while (ifp->if_snd.ifq_head != NULL) { IF_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) { QL_DPRINT8((ha->pci_dev, "%s: m_head == NULL\n", __func__)); break; } if (qls_send(ha, &m_head)) { if (m_head == NULL) break; QL_DPRINT8((ha->pci_dev, "%s: PREPEND\n", __func__)); ifp->if_drv_flags |= IFF_DRV_OACTIVE; IF_PREPEND(&ifp->if_snd, m_head); break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); } QLA_TX_UNLOCK(ha); QL_DPRINT8((ha->pci_dev, "%s: exit\n", __func__)); return; } static int qls_send(qla_host_t *ha, struct mbuf **m_headp) { bus_dma_segment_t segs[QLA_MAX_SEGMENTS]; bus_dmamap_t map; int nsegs; int ret = -1; uint32_t tx_idx; struct mbuf *m_head = *m_headp; uint32_t txr_idx = 0; QL_DPRINT8((ha->pci_dev, "%s: enter\n", __func__)); - if (m_head->m_flags & M_FLOWID) + /* check if flowid is set */ + if (M_HASHTYPE_GET(m_head) != M_HASHTYPE_NONE) txr_idx = m_head->m_pkthdr.flowid & (ha->num_tx_rings - 1); tx_idx = ha->tx_ring[txr_idx].txr_next; map = ha->tx_ring[txr_idx].tx_buf[tx_idx].map; ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs, BUS_DMA_NOWAIT); if (ret == EFBIG) { struct mbuf *m; QL_DPRINT8((ha->pci_dev, "%s: EFBIG [%d]\n", __func__, m_head->m_pkthdr.len)); m = m_defrag(m_head, M_NOWAIT); if (m == NULL) { ha->err_tx_defrag++; m_freem(m_head); *m_headp = NULL; device_printf(ha->pci_dev, "%s: m_defrag() = NULL [%d]\n", __func__, ret); return (ENOBUFS); } m_head = m; *m_headp = m_head; if ((ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs, BUS_DMA_NOWAIT))) { ha->err_tx_dmamap_load++; device_printf(ha->pci_dev, "%s: bus_dmamap_load_mbuf_sg failed0[%d, %d]\n", __func__, ret, m_head->m_pkthdr.len); if (ret != ENOMEM) { m_freem(m_head); *m_headp = NULL; } return (ret); } } else if (ret) { ha->err_tx_dmamap_load++; device_printf(ha->pci_dev, "%s: bus_dmamap_load_mbuf_sg failed1[%d, %d]\n", __func__, ret, m_head->m_pkthdr.len); if (ret != ENOMEM) { m_freem(m_head); *m_headp = NULL; } return (ret); } QL_ASSERT(ha, (nsegs != 0), ("qls_send: empty packet")); bus_dmamap_sync(ha->tx_tag, map, BUS_DMASYNC_PREWRITE); if (!(ret = qls_hw_send(ha, segs, nsegs, tx_idx, m_head, txr_idx))) { ha->tx_ring[txr_idx].count++; ha->tx_ring[txr_idx].tx_buf[tx_idx].m_head = m_head; ha->tx_ring[txr_idx].tx_buf[tx_idx].map = map; } else { if (ret == EINVAL) { if (m_head) m_freem(m_head); *m_headp = NULL; } } QL_DPRINT8((ha->pci_dev, "%s: exit\n", __func__)); return (ret); } static void qls_stop(qla_host_t *ha) { struct ifnet *ifp = ha->ifp; device_t dev; dev = ha->pci_dev; ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE | IFF_DRV_RUNNING); ha->flags.qla_watchdog_pause = 1; while (!ha->qla_watchdog_paused) qls_mdelay(__func__, 1); qls_del_hw_if(ha); qls_free_lro(ha); qls_flush_xmt_bufs(ha); qls_free_rcv_bufs(ha); return; } /* * Buffer Management Functions for Transmit and Receive Rings */ /* * Release mbuf after it sent on the wire */ static void qls_flush_tx_buf(qla_host_t *ha, qla_tx_buf_t *txb) { QL_DPRINT2((ha->pci_dev, "%s: enter\n", __func__)); if (txb->m_head) { bus_dmamap_unload(ha->tx_tag, txb->map); m_freem(txb->m_head); txb->m_head = NULL; } QL_DPRINT2((ha->pci_dev, "%s: exit\n", __func__)); } static void qls_flush_xmt_bufs(qla_host_t *ha) { int i, j; for (j = 0; j < ha->num_tx_rings; j++) { for (i = 0; i < NUM_TX_DESCRIPTORS; i++) qls_flush_tx_buf(ha, &ha->tx_ring[j].tx_buf[i]); } return; } static int qls_alloc_rcv_mbufs(qla_host_t *ha, int r) { int i, j, ret = 0; qla_rx_buf_t *rxb; qla_rx_ring_t *rx_ring; volatile q81_bq_addr_e_t *sbq_e; rx_ring = &ha->rx_ring[r]; for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { rxb = &rx_ring->rx_buf[i]; ret = bus_dmamap_create(ha->rx_tag, BUS_DMA_NOWAIT, &rxb->map); if (ret) { device_printf(ha->pci_dev, "%s: dmamap[%d, %d] failed\n", __func__, r, i); for (j = 0; j < i; j++) { rxb = &rx_ring->rx_buf[j]; bus_dmamap_destroy(ha->rx_tag, rxb->map); } goto qls_alloc_rcv_mbufs_err; } } rx_ring = &ha->rx_ring[r]; sbq_e = rx_ring->sbq_vaddr; rxb = &rx_ring->rx_buf[0]; for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { if (!(ret = qls_get_mbuf(ha, rxb, NULL))) { /* * set the physical address in the * corresponding descriptor entry in the * receive ring/queue for the hba */ sbq_e->addr_lo = rxb->paddr & 0xFFFFFFFF; sbq_e->addr_hi = (rxb->paddr >> 32) & 0xFFFFFFFF; } else { device_printf(ha->pci_dev, "%s: qls_get_mbuf [%d, %d] failed\n", __func__, r, i); bus_dmamap_destroy(ha->rx_tag, rxb->map); goto qls_alloc_rcv_mbufs_err; } rxb++; sbq_e++; } return 0; qls_alloc_rcv_mbufs_err: return (-1); } static void qls_free_rcv_bufs(qla_host_t *ha) { int i, r; qla_rx_buf_t *rxb; qla_rx_ring_t *rxr; for (r = 0; r < ha->num_rx_rings; r++) { rxr = &ha->rx_ring[r]; for (i = 0; i < NUM_RX_DESCRIPTORS; i++) { rxb = &rxr->rx_buf[i]; if (rxb->m_head != NULL) { bus_dmamap_unload(ha->rx_tag, rxb->map); bus_dmamap_destroy(ha->rx_tag, rxb->map); m_freem(rxb->m_head); } } bzero(rxr->rx_buf, (sizeof(qla_rx_buf_t) * NUM_RX_DESCRIPTORS)); } return; } static int qls_alloc_rcv_bufs(qla_host_t *ha) { int r, ret = 0; qla_rx_ring_t *rxr; for (r = 0; r < ha->num_rx_rings; r++) { rxr = &ha->rx_ring[r]; bzero(rxr->rx_buf, (sizeof(qla_rx_buf_t) * NUM_RX_DESCRIPTORS)); } for (r = 0; r < ha->num_rx_rings; r++) { ret = qls_alloc_rcv_mbufs(ha, r); if (ret) qls_free_rcv_bufs(ha); } return (ret); } int qls_get_mbuf(qla_host_t *ha, qla_rx_buf_t *rxb, struct mbuf *nmp) { register struct mbuf *mp = nmp; struct ifnet *ifp; int ret = 0; uint32_t offset; bus_dma_segment_t segs[1]; int nsegs; QL_DPRINT2((ha->pci_dev, "%s: enter\n", __func__)); ifp = ha->ifp; if (mp == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ha->msize); if (mp == NULL) { if (ha->msize == MCLBYTES) ha->err_m_getcl++; else ha->err_m_getjcl++; ret = ENOBUFS; device_printf(ha->pci_dev, "%s: m_getcl failed\n", __func__); goto exit_qls_get_mbuf; } mp->m_len = mp->m_pkthdr.len = ha->msize; } else { mp->m_len = mp->m_pkthdr.len = ha->msize; mp->m_data = mp->m_ext.ext_buf; mp->m_next = NULL; } /* align the receive buffers to 8 byte boundary */ offset = (uint32_t)((unsigned long long)mp->m_data & 0x7ULL); if (offset) { offset = 8 - offset; m_adj(mp, offset); } /* * Using memory from the mbuf cluster pool, invoke the bus_dma * machinery to arrange the memory mapping. */ ret = bus_dmamap_load_mbuf_sg(ha->rx_tag, rxb->map, mp, segs, &nsegs, BUS_DMA_NOWAIT); rxb->paddr = segs[0].ds_addr; if (ret || !rxb->paddr || (nsegs != 1)) { m_freem(mp); rxb->m_head = NULL; device_printf(ha->pci_dev, "%s: bus_dmamap_load failed[%d, 0x%016llx, %d]\n", __func__, ret, (long long unsigned int)rxb->paddr, nsegs); ret = -1; goto exit_qls_get_mbuf; } rxb->m_head = mp; bus_dmamap_sync(ha->rx_tag, rxb->map, BUS_DMASYNC_PREREAD); exit_qls_get_mbuf: QL_DPRINT2((ha->pci_dev, "%s: exit ret = 0x%08x\n", __func__, ret)); return (ret); } static void qls_tx_done(void *context, int pending) { qla_host_t *ha = context; struct ifnet *ifp; ifp = ha->ifp; if (!ifp) return; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { QL_DPRINT8((ha->pci_dev, "%s: !IFF_DRV_RUNNING\n", __func__)); return; } qls_start(ha->ifp); return; } static int qls_config_lro(qla_host_t *ha) { int i; struct lro_ctrl *lro; for (i = 0; i < ha->num_rx_rings; i++) { lro = &ha->rx_ring[i].lro; if (tcp_lro_init(lro)) { device_printf(ha->pci_dev, "%s: tcp_lro_init failed\n", __func__); return (-1); } lro->ifp = ha->ifp; } ha->flags.lro_init = 1; QL_DPRINT2((ha->pci_dev, "%s: LRO initialized\n", __func__)); return (0); } static void qls_free_lro(qla_host_t *ha) { int i; struct lro_ctrl *lro; if (!ha->flags.lro_init) return; for (i = 0; i < ha->num_rx_rings; i++) { lro = &ha->rx_ring[i].lro; tcp_lro_free(lro); } ha->flags.lro_init = 0; } static void qls_error_recovery(void *context, int pending) { qla_host_t *ha = context; qls_init(ha); return; } Index: head/sys/dev/sfxge/sfxge_rx.c =================================================================== --- head/sys/dev/sfxge/sfxge_rx.c (revision 275357) +++ head/sys/dev/sfxge/sfxge_rx.c (revision 275358) @@ -1,1234 +1,1234 @@ /*- * Copyright (c) 2010-2011 Solarflare Communications, Inc. * All rights reserved. * * This software was developed in part by Philip Paeps under contract for * Solarflare Communications, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/efx.h" #include "sfxge.h" #include "sfxge_rx.h" #define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10) /* Size of the LRO hash table. Must be a power of 2. A larger table * means we can accelerate a larger number of streams. */ static unsigned lro_table_size = 128; /* Maximum length of a hash chain. If chains get too long then the lookup * time increases and may exceed the benefit of LRO. */ static unsigned lro_chain_max = 20; /* Maximum time (in ticks) that a connection can be idle before it's LRO * state is discarded. */ static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */ /* Number of packets with payload that must arrive in-order before a * connection is eligible for LRO. The idea is we should avoid coalescing * segments when the sender is in slow-start because reducing the ACK rate * can damage performance. */ static int lro_slow_start_packets = 2000; /* Number of packets with payload that must arrive in-order following loss * before a connection is eligible for LRO. The idea is we should avoid * coalescing segments when the sender is recovering from loss, because * reducing the ACK rate can damage performance. */ static int lro_loss_packets = 20; /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */ #define SFXGE_LRO_L2_ID_VLAN 0x4000 #define SFXGE_LRO_L2_ID_IPV6 0x8000 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN) #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6)) /* Compare IPv6 addresses, avoiding conditional branches */ static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left, const struct in6_addr *right) { #if LONG_BIT == 64 const uint64_t *left64 = (const uint64_t *)left; const uint64_t *right64 = (const uint64_t *)right; return (left64[0] - right64[0]) | (left64[1] - right64[1]); #else return (left->s6_addr32[0] - right->s6_addr32[0]) | (left->s6_addr32[1] - right->s6_addr32[1]) | (left->s6_addr32[2] - right->s6_addr32[2]) | (left->s6_addr32[3] - right->s6_addr32[3]); #endif } void sfxge_rx_qflush_done(struct sfxge_rxq *rxq) { rxq->flush_state = SFXGE_FLUSH_DONE; } void sfxge_rx_qflush_failed(struct sfxge_rxq *rxq) { rxq->flush_state = SFXGE_FLUSH_FAILED; } static uint8_t toep_key[] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa }; static void sfxge_rx_post_refill(void *arg) { struct sfxge_rxq *rxq = arg; struct sfxge_softc *sc; unsigned int index; struct sfxge_evq *evq; uint16_t magic; sc = rxq->sc; index = rxq->index; evq = sc->evq[index]; magic = SFXGE_MAGIC_RX_QREFILL | index; /* This is guaranteed due to the start/stop order of rx and ev */ KASSERT(evq->init_state == SFXGE_EVQ_STARTED, ("evq not started")); KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, ("rxq not started")); efx_ev_qpost(evq->common, magic); } static void sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying) { /* Initially retry after 100 ms, but back off in case of * repeated failures as we probably have to wait for the * administrator to raise the pool limit. */ if (retrying) rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz); else rxq->refill_delay = hz / 10; callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay, sfxge_rx_post_refill, rxq); } static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc) { struct mb_args args; struct mbuf *m; /* Allocate mbuf structure */ args.flags = M_PKTHDR; args.type = MT_DATA; m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_NOWAIT); /* Allocate (and attach) packet buffer */ if (m != NULL && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_NOWAIT)) { uma_zfree(zone_mbuf, m); m = NULL; } return (m); } #define SFXGE_REFILL_BATCH 64 static void sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying) { struct sfxge_softc *sc; unsigned int index; struct sfxge_evq *evq; unsigned int batch; unsigned int rxfill; unsigned int mblksize; int ntodo; efsys_dma_addr_t addr[SFXGE_REFILL_BATCH]; sc = rxq->sc; index = rxq->index; evq = sc->evq[index]; prefetch_read_many(sc->enp); prefetch_read_many(rxq->common); mtx_assert(&evq->lock, MA_OWNED); if (rxq->init_state != SFXGE_RXQ_STARTED) return; rxfill = rxq->added - rxq->completed; KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries), ("rxfill > EFX_RXQ_LIMIT(rxq->entries)")); ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target); KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries), ("ntodo > EFX_RQX_LIMIT(rxq->entries)")); if (ntodo == 0) return; batch = 0; mblksize = sc->rx_buffer_size; while (ntodo-- > 0) { unsigned int id; struct sfxge_rx_sw_desc *rx_desc; bus_dma_segment_t seg; struct mbuf *m; id = (rxq->added + batch) & rxq->ptr_mask; rx_desc = &rxq->queue[id]; KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL")); rx_desc->flags = EFX_DISCARD; m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc); if (m == NULL) break; sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg); addr[batch++] = seg.ds_addr; if (batch == SFXGE_REFILL_BATCH) { efx_rx_qpost(rxq->common, addr, mblksize, batch, rxq->completed, rxq->added); rxq->added += batch; batch = 0; } } if (ntodo != 0) sfxge_rx_schedule_refill(rxq, retrying); if (batch != 0) { efx_rx_qpost(rxq->common, addr, mblksize, batch, rxq->completed, rxq->added); rxq->added += batch; } /* Make the descriptors visible to the hardware */ bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map, BUS_DMASYNC_PREWRITE); efx_rx_qpush(rxq->common, rxq->added); } void sfxge_rx_qrefill(struct sfxge_rxq *rxq) { if (rxq->init_state != SFXGE_RXQ_STARTED) return; /* Make sure the queue is full */ sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE); } static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m) { struct ifnet *ifp = sc->ifnet; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_data = 0xffff; ifp->if_input(ifp, m); } static void sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc) { struct mbuf *m = rx_desc->mbuf; int csum_flags; /* Convert checksum flags */ csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ? (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0; if (rx_desc->flags & EFX_CKSUM_TCPUDP) csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; #ifdef SFXGE_HAVE_MQ /* The hash covers a 4-tuple for TCP only */ if (rx_desc->flags & EFX_PKT_TCP) { m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ, mtod(m, uint8_t *)); - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); } #endif m->m_data += sc->rx_prefix_size; m->m_len = rx_desc->size - sc->rx_prefix_size; m->m_pkthdr.len = m->m_len; m->m_pkthdr.csum_flags = csum_flags; __sfxge_rx_deliver(sc, rx_desc->mbuf); rx_desc->flags = EFX_DISCARD; rx_desc->mbuf = NULL; } static void sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c) { struct sfxge_softc *sc = st->sc; struct mbuf *m = c->mbuf; struct tcphdr *c_th; int csum_flags; KASSERT(m, ("no mbuf to deliver")); ++st->n_bursts; /* Finish off packet munging and recalculate IP header checksum. */ if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { struct ip *iph = c->nh; iph->ip_len = htons(iph->ip_len); iph->ip_sum = 0; iph->ip_sum = in_cksum_hdr(iph); c_th = (struct tcphdr *)(iph + 1); csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID); } else { struct ip6_hdr *iph = c->nh; iph->ip6_plen = htons(iph->ip6_plen); c_th = (struct tcphdr *)(iph + 1); csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; } c_th->th_win = c->th_last->th_win; c_th->th_ack = c->th_last->th_ack; if (c_th->th_off == c->th_last->th_off) { /* Copy TCP options (take care to avoid going negative). */ int optlen = ((c_th->th_off - 5) & 0xf) << 2u; memcpy(c_th + 1, c->th_last + 1, optlen); } #ifdef SFXGE_HAVE_MQ m->m_pkthdr.flowid = c->conn_hash; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); #endif m->m_pkthdr.csum_flags = csum_flags; __sfxge_rx_deliver(sc, m); c->mbuf = NULL; c->delivered = 1; } /* Drop the given connection, and add it to the free list. */ static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) { unsigned bucket; KASSERT(!c->mbuf, ("found orphaned mbuf")); if (c->next_buf.mbuf != NULL) { sfxge_rx_deliver(rxq->sc, &c->next_buf); LIST_REMOVE(c, active_link); } bucket = c->conn_hash & rxq->lro.conns_mask; KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong")); --rxq->lro.conns_n[bucket]; TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link); } /* Stop tracking connections that have gone idle in order to keep hash * chains short. */ static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now) { struct sfxge_lro_conn *c; unsigned i; KASSERT(LIST_EMPTY(&rxq->lro.active_conns), ("found active connections")); rxq->lro.last_purge_ticks = now; for (i = 0; i <= rxq->lro.conns_mask; ++i) { if (TAILQ_EMPTY(&rxq->lro.conns[i])) continue; c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq); if (now - c->last_pkt_ticks > lro_idle_ticks) { ++rxq->lro.n_drop_idle; sfxge_lro_drop(rxq, c); } } } static void sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, struct mbuf *mbuf, struct tcphdr *th) { struct tcphdr *c_th; /* Tack the new mbuf onto the chain. */ KASSERT(!mbuf->m_next, ("mbuf already chained")); c->mbuf_tail->m_next = mbuf; c->mbuf_tail = mbuf; /* Increase length appropriately */ c->mbuf->m_pkthdr.len += mbuf->m_len; /* Update the connection state flags */ if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { struct ip *iph = c->nh; iph->ip_len += mbuf->m_len; c_th = (struct tcphdr *)(iph + 1); } else { struct ip6_hdr *iph = c->nh; iph->ip6_plen += mbuf->m_len; c_th = (struct tcphdr *)(iph + 1); } c_th->th_flags |= (th->th_flags & TH_PUSH); c->th_last = th; ++st->n_merges; /* Pass packet up now if another segment could overflow the IP * length. */ if (c->mbuf->m_pkthdr.len > 65536 - 9200) sfxge_lro_deliver(st, c); } static void sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, struct mbuf *mbuf, void *nh, struct tcphdr *th) { /* Start the chain */ c->mbuf = mbuf; c->mbuf_tail = c->mbuf; c->nh = nh; c->th_last = th; mbuf->m_pkthdr.len = mbuf->m_len; /* Mangle header fields for later processing */ if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { struct ip *iph = nh; iph->ip_len = ntohs(iph->ip_len); } else { struct ip6_hdr *iph = nh; iph->ip6_plen = ntohs(iph->ip6_plen); } } /* Try to merge or otherwise hold or deliver (as appropriate) the * packet buffered for this connection (c->next_buf). Return a flag * indicating whether the connection is still active for LRO purposes. */ static int sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) { struct sfxge_rx_sw_desc *rx_buf = &c->next_buf; char *eh = c->next_eh; int data_length, hdr_length, dont_merge; unsigned th_seq, pkt_length; struct tcphdr *th; unsigned now; if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { struct ip *iph = c->next_nh; th = (struct tcphdr *)(iph + 1); pkt_length = ntohs(iph->ip_len) + (char *) iph - eh; } else { struct ip6_hdr *iph = c->next_nh; th = (struct tcphdr *)(iph + 1); pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh; } hdr_length = (char *) th + th->th_off * 4 - eh; data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) - hdr_length); th_seq = ntohl(th->th_seq); dont_merge = ((data_length <= 0) | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN))); /* Check for options other than aligned timestamp. */ if (th->th_off != 5) { const uint32_t *opt_ptr = (const uint32_t *) (th + 1); if (th->th_off == 8 && opt_ptr[0] == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { /* timestamp option -- okay */ } else { dont_merge = 1; } } if (__predict_false(th_seq != c->next_seq)) { /* Out-of-order, so start counting again. */ if (c->mbuf != NULL) sfxge_lro_deliver(&rxq->lro, c); c->n_in_order_pkts -= lro_loss_packets; c->next_seq = th_seq + data_length; ++rxq->lro.n_misorder; goto deliver_buf_out; } c->next_seq = th_seq + data_length; now = ticks; if (now - c->last_pkt_ticks > lro_idle_ticks) { ++rxq->lro.n_drop_idle; if (c->mbuf != NULL) sfxge_lro_deliver(&rxq->lro, c); sfxge_lro_drop(rxq, c); return (0); } c->last_pkt_ticks = ticks; if (c->n_in_order_pkts < lro_slow_start_packets) { /* May be in slow-start, so don't merge. */ ++rxq->lro.n_slow_start; ++c->n_in_order_pkts; goto deliver_buf_out; } if (__predict_false(dont_merge)) { if (c->mbuf != NULL) sfxge_lro_deliver(&rxq->lro, c); if (th->th_flags & (TH_FIN | TH_RST)) { ++rxq->lro.n_drop_closed; sfxge_lro_drop(rxq, c); return (0); } goto deliver_buf_out; } rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size; if (__predict_true(c->mbuf != NULL)) { /* Remove headers and any padding */ rx_buf->mbuf->m_data += hdr_length; rx_buf->mbuf->m_len = data_length; sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th); } else { /* Remove any padding */ rx_buf->mbuf->m_len = pkt_length; sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th); } rx_buf->mbuf = NULL; return (1); deliver_buf_out: sfxge_rx_deliver(rxq->sc, rx_buf); return (1); } static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash, uint16_t l2_id, void *nh, struct tcphdr *th) { unsigned bucket = conn_hash & st->conns_mask; struct sfxge_lro_conn *c; if (st->conns_n[bucket] >= lro_chain_max) { ++st->n_too_many; return; } if (!TAILQ_EMPTY(&st->free_conns)) { c = TAILQ_FIRST(&st->free_conns); TAILQ_REMOVE(&st->free_conns, c, link); } else { c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT); if (c == NULL) return; c->mbuf = NULL; c->next_buf.mbuf = NULL; } /* Create the connection tracking data */ ++st->conns_n[bucket]; TAILQ_INSERT_HEAD(&st->conns[bucket], c, link); c->l2_id = l2_id; c->conn_hash = conn_hash; c->source = th->th_sport; c->dest = th->th_dport; c->n_in_order_pkts = 0; c->last_pkt_ticks = *(volatile int *)&ticks; c->delivered = 0; ++st->n_new_stream; /* NB. We don't initialise c->next_seq, and it doesn't matter what * value it has. Most likely the next packet received for this * connection will not match -- no harm done. */ } /* Process mbuf and decide whether to dispatch it to the stack now or * later. */ static void sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) { struct sfxge_softc *sc = rxq->sc; struct mbuf *m = rx_buf->mbuf; struct ether_header *eh; struct sfxge_lro_conn *c; uint16_t l2_id; uint16_t l3_proto; void *nh; struct tcphdr *th; uint32_t conn_hash; unsigned bucket; /* Get the hardware hash */ conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ, mtod(m, uint8_t *)); eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *veh = (struct ether_vlan_header *)eh; l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) | SFXGE_LRO_L2_ID_VLAN; l3_proto = veh->evl_proto; nh = veh + 1; } else { l2_id = 0; l3_proto = eh->ether_type; nh = eh + 1; } /* Check whether this is a suitable packet (unfragmented * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and * length, and compute a hash if necessary. If not, return. */ if (l3_proto == htons(ETHERTYPE_IP)) { struct ip *iph = nh; if ((iph->ip_p - IPPROTO_TCP) | (iph->ip_hl - (sizeof(*iph) >> 2u)) | (iph->ip_off & htons(IP_MF | IP_OFFMASK))) goto deliver_now; th = (struct tcphdr *)(iph + 1); } else if (l3_proto == htons(ETHERTYPE_IPV6)) { struct ip6_hdr *iph = nh; if (iph->ip6_nxt != IPPROTO_TCP) goto deliver_now; l2_id |= SFXGE_LRO_L2_ID_IPV6; th = (struct tcphdr *)(iph + 1); } else { goto deliver_now; } bucket = conn_hash & rxq->lro.conns_mask; TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) { if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash)) continue; if ((c->source - th->th_sport) | (c->dest - th->th_dport)) continue; if (c->mbuf != NULL) { if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { struct ip *c_iph, *iph = nh; c_iph = c->nh; if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) | (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr)) continue; } else { struct ip6_hdr *c_iph, *iph = nh; c_iph = c->nh; if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) | ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst)) continue; } } /* Re-insert at head of list to reduce lookup time. */ TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link); if (c->next_buf.mbuf != NULL) { if (!sfxge_lro_try_merge(rxq, c)) goto deliver_now; } else { LIST_INSERT_HEAD(&rxq->lro.active_conns, c, active_link); } c->next_buf = *rx_buf; c->next_eh = eh; c->next_nh = nh; rx_buf->mbuf = NULL; rx_buf->flags = EFX_DISCARD; return; } sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th); deliver_now: sfxge_rx_deliver(sc, rx_buf); } static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) { struct sfxge_lro_state *st = &rxq->lro; struct sfxge_lro_conn *c; unsigned t; while (!LIST_EMPTY(&st->active_conns)) { c = LIST_FIRST(&st->active_conns); if (!c->delivered && c->mbuf != NULL) sfxge_lro_deliver(st, c); if (sfxge_lro_try_merge(rxq, c)) { if (c->mbuf != NULL) sfxge_lro_deliver(st, c); LIST_REMOVE(c, active_link); } c->delivered = 0; } t = *(volatile int *)&ticks; if (__predict_false(t != st->last_purge_ticks)) sfxge_lro_purge_idle(rxq, t); } void sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop) { struct sfxge_softc *sc = rxq->sc; int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO; unsigned int index; struct sfxge_evq *evq; unsigned int completed; unsigned int level; struct mbuf *m; struct sfxge_rx_sw_desc *prev = NULL; index = rxq->index; evq = sc->evq[index]; mtx_assert(&evq->lock, MA_OWNED); completed = rxq->completed; while (completed != rxq->pending) { unsigned int id; struct sfxge_rx_sw_desc *rx_desc; id = completed++ & rxq->ptr_mask; rx_desc = &rxq->queue[id]; m = rx_desc->mbuf; if (rxq->init_state != SFXGE_RXQ_STARTED) goto discard; if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD)) goto discard; prefetch_read_many(mtod(m, caddr_t)); /* Check for loopback packets */ if (!(rx_desc->flags & EFX_PKT_IPV4) && !(rx_desc->flags & EFX_PKT_IPV6)) { struct ether_header *etherhp; /*LINTED*/ etherhp = mtod(m, struct ether_header *); if (etherhp->ether_type == htons(SFXGE_ETHERTYPE_LOOPBACK)) { EFSYS_PROBE(loopback); rxq->loopback++; goto discard; } } /* Pass packet up the stack or into LRO (pipelined) */ if (prev != NULL) { if (lro_enabled) sfxge_lro(rxq, prev); else sfxge_rx_deliver(sc, prev); } prev = rx_desc; continue; discard: /* Return the packet to the pool */ m_free(m); rx_desc->mbuf = NULL; } rxq->completed = completed; level = rxq->added - rxq->completed; /* Pass last packet up the stack or into LRO */ if (prev != NULL) { if (lro_enabled) sfxge_lro(rxq, prev); else sfxge_rx_deliver(sc, prev); } /* * If there are any pending flows and this is the end of the * poll then they must be completed. */ if (eop) sfxge_lro_end_of_burst(rxq); /* Top up the queue if necessary */ if (level < rxq->refill_threshold) sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE); } static void sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index) { struct sfxge_rxq *rxq; struct sfxge_evq *evq; unsigned int count; rxq = sc->rxq[index]; evq = sc->evq[index]; mtx_lock(&evq->lock); KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, ("rxq not started")); rxq->init_state = SFXGE_RXQ_INITIALIZED; callout_stop(&rxq->refill_callout); again: rxq->flush_state = SFXGE_FLUSH_PENDING; /* Flush the receive queue */ efx_rx_qflush(rxq->common); mtx_unlock(&evq->lock); count = 0; do { /* Spin for 100 ms */ DELAY(100000); if (rxq->flush_state != SFXGE_FLUSH_PENDING) break; } while (++count < 20); mtx_lock(&evq->lock); if (rxq->flush_state == SFXGE_FLUSH_FAILED) goto again; rxq->flush_state = SFXGE_FLUSH_DONE; rxq->pending = rxq->added; sfxge_rx_qcomplete(rxq, B_TRUE); KASSERT(rxq->completed == rxq->pending, ("rxq->completed != rxq->pending")); rxq->added = 0; rxq->pending = 0; rxq->completed = 0; rxq->loopback = 0; /* Destroy the common code receive queue. */ efx_rx_qdestroy(rxq->common); efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, EFX_RXQ_NBUFS(sc->rxq_entries)); mtx_unlock(&evq->lock); } static int sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index) { struct sfxge_rxq *rxq; efsys_mem_t *esmp; struct sfxge_evq *evq; int rc; rxq = sc->rxq[index]; esmp = &rxq->mem; evq = sc->evq[index]; KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); KASSERT(evq->init_state == SFXGE_EVQ_STARTED, ("evq->init_state != SFXGE_EVQ_STARTED")); /* Program the buffer table. */ if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp, EFX_RXQ_NBUFS(sc->rxq_entries))) != 0) return (rc); /* Create the common code receive queue. */ if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT, esmp, sc->rxq_entries, rxq->buf_base_id, evq->common, &rxq->common)) != 0) goto fail; mtx_lock(&evq->lock); /* Enable the receive queue. */ efx_rx_qenable(rxq->common); rxq->init_state = SFXGE_RXQ_STARTED; /* Try to fill the queue from the pool. */ sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE); mtx_unlock(&evq->lock); return (0); fail: efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, EFX_RXQ_NBUFS(sc->rxq_entries)); return (rc); } void sfxge_rx_stop(struct sfxge_softc *sc) { struct sfxge_intr *intr; int index; intr = &sc->intr; /* Stop the receive queue(s) */ index = intr->n_alloc; while (--index >= 0) sfxge_rx_qstop(sc, index); sc->rx_prefix_size = 0; sc->rx_buffer_size = 0; efx_rx_fini(sc->enp); } int sfxge_rx_start(struct sfxge_softc *sc) { struct sfxge_intr *intr; int index; int rc; intr = &sc->intr; /* Initialize the common code receive module. */ if ((rc = efx_rx_init(sc->enp)) != 0) return (rc); /* Calculate the receive packet buffer size. */ sc->rx_prefix_size = EFX_RX_PREFIX_SIZE; sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) + sc->rx_prefix_size); /* Select zone for packet buffers */ if (sc->rx_buffer_size <= MCLBYTES) sc->rx_buffer_zone = zone_clust; else if (sc->rx_buffer_size <= MJUMPAGESIZE) sc->rx_buffer_zone = zone_jumbop; else if (sc->rx_buffer_size <= MJUM9BYTES) sc->rx_buffer_zone = zone_jumbo9; else sc->rx_buffer_zone = zone_jumbo16; /* * Set up the scale table. Enable all hash types and hash insertion. */ for (index = 0; index < SFXGE_RX_SCALE_MAX; index++) sc->rx_indir_table[index] = index % sc->intr.n_alloc; if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table, SFXGE_RX_SCALE_MAX)) != 0) goto fail; (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ, (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) | (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE); if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key, sizeof(toep_key))) != 0) goto fail; /* Start the receive queue(s). */ for (index = 0; index < intr->n_alloc; index++) { if ((rc = sfxge_rx_qstart(sc, index)) != 0) goto fail2; } return (0); fail2: while (--index >= 0) sfxge_rx_qstop(sc, index); fail: efx_rx_fini(sc->enp); return (rc); } static void sfxge_lro_init(struct sfxge_rxq *rxq) { struct sfxge_lro_state *st = &rxq->lro; unsigned i; st->conns_mask = lro_table_size - 1; KASSERT(!((st->conns_mask + 1) & st->conns_mask), ("lro_table_size must be a power of 2")); st->sc = rxq->sc; st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]), M_SFXGE, M_WAITOK); st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]), M_SFXGE, M_WAITOK); for (i = 0; i <= st->conns_mask; ++i) { TAILQ_INIT(&st->conns[i]); st->conns_n[i] = 0; } LIST_INIT(&st->active_conns); TAILQ_INIT(&st->free_conns); } static void sfxge_lro_fini(struct sfxge_rxq *rxq) { struct sfxge_lro_state *st = &rxq->lro; struct sfxge_lro_conn *c; unsigned i; /* Return cleanly if sfxge_lro_init() has not been called. */ if (st->conns == NULL) return; KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections")); for (i = 0; i <= st->conns_mask; ++i) { while (!TAILQ_EMPTY(&st->conns[i])) { c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq); sfxge_lro_drop(rxq, c); } } while (!TAILQ_EMPTY(&st->free_conns)) { c = TAILQ_FIRST(&st->free_conns); TAILQ_REMOVE(&st->free_conns, c, link); KASSERT(!c->mbuf, ("found orphaned mbuf")); free(c, M_SFXGE); } free(st->conns_n, M_SFXGE); free(st->conns, M_SFXGE); st->conns = NULL; } static void sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index) { struct sfxge_rxq *rxq; rxq = sc->rxq[index]; KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); /* Free the context array and the flow table. */ free(rxq->queue, M_SFXGE); sfxge_lro_fini(rxq); /* Release DMA memory. */ sfxge_dma_free(&rxq->mem); sc->rxq[index] = NULL; free(rxq, M_SFXGE); } static int sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index) { struct sfxge_rxq *rxq; struct sfxge_evq *evq; efsys_mem_t *esmp; int rc; KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc)); rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK); rxq->sc = sc; rxq->index = index; rxq->entries = sc->rxq_entries; rxq->ptr_mask = rxq->entries - 1; rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries); sc->rxq[index] = rxq; esmp = &rxq->mem; evq = sc->evq[index]; /* Allocate and zero DMA space. */ if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0) return (rc); (void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(sc->rxq_entries)); /* Allocate buffer table entries. */ sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries), &rxq->buf_base_id); /* Allocate the context array and the flow table. */ rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries, M_SFXGE, M_WAITOK | M_ZERO); sfxge_lro_init(rxq); callout_init(&rxq->refill_callout, B_TRUE); rxq->init_state = SFXGE_RXQ_INITIALIZED; return (0); } static const struct { const char *name; size_t offset; } sfxge_rx_stats[] = { #define SFXGE_RX_STAT(name, member) \ { #name, offsetof(struct sfxge_rxq, member) } SFXGE_RX_STAT(lro_merges, lro.n_merges), SFXGE_RX_STAT(lro_bursts, lro.n_bursts), SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start), SFXGE_RX_STAT(lro_misorder, lro.n_misorder), SFXGE_RX_STAT(lro_too_many, lro.n_too_many), SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream), SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle), SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed) }; static int sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS) { struct sfxge_softc *sc = arg1; unsigned int id = arg2; unsigned int sum, index; /* Sum across all RX queues */ sum = 0; for (index = 0; index < sc->intr.n_alloc; index++) sum += *(unsigned int *)((caddr_t)sc->rxq[index] + sfxge_rx_stats[id].offset); return (SYSCTL_OUT(req, &sum, sizeof(sum))); } static void sfxge_rx_stat_init(struct sfxge_softc *sc) { struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev); struct sysctl_oid_list *stat_list; unsigned int id; stat_list = SYSCTL_CHILDREN(sc->stats_node); for (id = 0; id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]); id++) { SYSCTL_ADD_PROC( ctx, stat_list, OID_AUTO, sfxge_rx_stats[id].name, CTLTYPE_UINT|CTLFLAG_RD, sc, id, sfxge_rx_stat_handler, "IU", ""); } } void sfxge_rx_fini(struct sfxge_softc *sc) { struct sfxge_intr *intr; int index; intr = &sc->intr; index = intr->n_alloc; while (--index >= 0) sfxge_rx_qfini(sc, index); } int sfxge_rx_init(struct sfxge_softc *sc) { struct sfxge_intr *intr; int index; int rc; if (lro_idle_ticks == 0) lro_idle_ticks = hz / 10 + 1; /* 100 ms */ intr = &sc->intr; KASSERT(intr->state == SFXGE_INTR_INITIALIZED, ("intr->state != SFXGE_INTR_INITIALIZED")); /* Initialize the receive queue(s) - one per interrupt. */ for (index = 0; index < intr->n_alloc; index++) { if ((rc = sfxge_rx_qinit(sc, index)) != 0) goto fail; } sfxge_rx_stat_init(sc); return (0); fail: /* Tear down the receive queue(s). */ while (--index >= 0) sfxge_rx_qfini(sc, index); return (rc); } Index: head/sys/dev/sfxge/sfxge_tx.c =================================================================== --- head/sys/dev/sfxge/sfxge_tx.c (revision 275357) +++ head/sys/dev/sfxge/sfxge_tx.c (revision 275358) @@ -1,1578 +1,1579 @@ /*- * Copyright (c) 2010-2011 Solarflare Communications, Inc. * All rights reserved. * * This software was developed in part by Philip Paeps under contract for * Solarflare Communications, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Theory of operation: * * Tx queues allocation and mapping * * One Tx queue with enabled checksum offload is allocated per Rx channel * (event queue). Also 2 Tx queues (one without checksum offload and one * with IP checksum offload only) are allocated and bound to event queue 0. * sfxge_txq_type is used as Tx queue label. * * So, event queue plus label mapping to Tx queue index is: * if event queue index is 0, TxQ-index = TxQ-label * [0..SFXGE_TXQ_NTYPES) * else TxQ-index = SFXGE_TXQ_NTYPES + EvQ-index - 1 * See sfxge_get_txq_by_label() sfxge_ev.c */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/efx.h" #include "sfxge.h" #include "sfxge_tx.h" /* Set the block level to ensure there is space to generate a * large number of descriptors for TSO. With minimum MSS and * maximum mbuf length we might need more than a ring-ful of * descriptors, but this should not happen in practice except * due to deliberate attack. In that case we will truncate * the output at a packet boundary. Allow for a reasonable * minimum MSS of 512. */ #define SFXGE_TSO_MAX_DESC ((65535 / 512) * 2 + SFXGE_TX_MAPPING_MAX_SEG - 1) #define SFXGE_TXQ_BLOCK_LEVEL(_entries) ((_entries) - SFXGE_TSO_MAX_DESC) #ifdef SFXGE_HAVE_MQ #define SFXGE_PARAM_TX_DPL_GET_MAX SFXGE_PARAM(tx_dpl_get_max) static int sfxge_tx_dpl_get_max = SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT; TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_MAX, &sfxge_tx_dpl_get_max); SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_max, CTLFLAG_RDTUN, &sfxge_tx_dpl_get_max, 0, "Maximum number of packets in deferred packet get-list"); #define SFXGE_PARAM_TX_DPL_PUT_MAX SFXGE_PARAM(tx_dpl_put_max) static int sfxge_tx_dpl_put_max = SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT; TUNABLE_INT(SFXGE_PARAM_TX_DPL_PUT_MAX, &sfxge_tx_dpl_put_max); SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_put_max, CTLFLAG_RDTUN, &sfxge_tx_dpl_put_max, 0, "Maximum number of packets in deferred packet put-list"); #endif /* Forward declarations. */ static inline void sfxge_tx_qdpl_service(struct sfxge_txq *txq); static void sfxge_tx_qlist_post(struct sfxge_txq *txq); static void sfxge_tx_qunblock(struct sfxge_txq *txq); static int sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf, const bus_dma_segment_t *dma_seg, int n_dma_seg); void sfxge_tx_qcomplete(struct sfxge_txq *txq) { struct sfxge_softc *sc; struct sfxge_evq *evq; unsigned int completed; sc = txq->sc; evq = sc->evq[txq->evq_index]; mtx_assert(&evq->lock, MA_OWNED); completed = txq->completed; while (completed != txq->pending) { struct sfxge_tx_mapping *stmp; unsigned int id; id = completed++ & txq->ptr_mask; stmp = &txq->stmp[id]; if (stmp->flags & TX_BUF_UNMAP) { bus_dmamap_unload(txq->packet_dma_tag, stmp->map); if (stmp->flags & TX_BUF_MBUF) { struct mbuf *m = stmp->u.mbuf; do m = m_free(m); while (m != NULL); } else { free(stmp->u.heap_buf, M_SFXGE); } stmp->flags = 0; } } txq->completed = completed; /* Check whether we need to unblock the queue. */ mb(); if (txq->blocked) { unsigned int level; level = txq->added - txq->completed; if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries)) sfxge_tx_qunblock(txq); } } #ifdef SFXGE_HAVE_MQ /* * Reorder the put list and append it to the get list. */ static void sfxge_tx_qdpl_swizzle(struct sfxge_txq *txq) { struct sfxge_tx_dpl *stdp; struct mbuf *mbuf, *get_next, **get_tailp; volatile uintptr_t *putp; uintptr_t put; unsigned int count; mtx_assert(&txq->lock, MA_OWNED); stdp = &txq->dpl; /* Acquire the put list. */ putp = &stdp->std_put; put = atomic_readandclear_ptr(putp); mbuf = (void *)put; if (mbuf == NULL) return; /* Reverse the put list. */ get_tailp = &mbuf->m_nextpkt; get_next = NULL; count = 0; do { struct mbuf *put_next; put_next = mbuf->m_nextpkt; mbuf->m_nextpkt = get_next; get_next = mbuf; mbuf = put_next; count++; } while (mbuf != NULL); /* Append the reversed put list to the get list. */ KASSERT(*get_tailp == NULL, ("*get_tailp != NULL")); *stdp->std_getp = get_next; stdp->std_getp = get_tailp; stdp->std_get_count += count; } #endif /* SFXGE_HAVE_MQ */ static void sfxge_tx_qreap(struct sfxge_txq *txq) { mtx_assert(SFXGE_TXQ_LOCK(txq), MA_OWNED); txq->reaped = txq->completed; } static void sfxge_tx_qlist_post(struct sfxge_txq *txq) { unsigned int old_added; unsigned int level; int rc; mtx_assert(SFXGE_TXQ_LOCK(txq), MA_OWNED); KASSERT(txq->n_pend_desc != 0, ("txq->n_pend_desc == 0")); KASSERT(txq->n_pend_desc <= SFXGE_TSO_MAX_DESC, ("txq->n_pend_desc too large")); KASSERT(!txq->blocked, ("txq->blocked")); old_added = txq->added; /* Post the fragment list. */ rc = efx_tx_qpost(txq->common, txq->pend_desc, txq->n_pend_desc, txq->reaped, &txq->added); KASSERT(rc == 0, ("efx_tx_qpost() failed")); /* If efx_tx_qpost() had to refragment, our information about * buffers to free may be associated with the wrong * descriptors. */ KASSERT(txq->added - old_added == txq->n_pend_desc, ("efx_tx_qpost() refragmented descriptors")); level = txq->added - txq->reaped; KASSERT(level <= txq->entries, ("overfilled TX queue")); /* Clear the fragment list. */ txq->n_pend_desc = 0; /* Have we reached the block level? */ if (level < SFXGE_TXQ_BLOCK_LEVEL(txq->entries)) return; /* Reap, and check again */ sfxge_tx_qreap(txq); level = txq->added - txq->reaped; if (level < SFXGE_TXQ_BLOCK_LEVEL(txq->entries)) return; txq->blocked = 1; /* * Avoid a race with completion interrupt handling that could leave * the queue blocked. */ mb(); sfxge_tx_qreap(txq); level = txq->added - txq->reaped; if (level < SFXGE_TXQ_BLOCK_LEVEL(txq->entries)) { mb(); txq->blocked = 0; } } static int sfxge_tx_queue_mbuf(struct sfxge_txq *txq, struct mbuf *mbuf) { bus_dmamap_t *used_map; bus_dmamap_t map; bus_dma_segment_t dma_seg[SFXGE_TX_MAPPING_MAX_SEG]; unsigned int id; struct sfxge_tx_mapping *stmp; efx_buffer_t *desc; int n_dma_seg; int rc; int i; KASSERT(!txq->blocked, ("txq->blocked")); if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) prefetch_read_many(mbuf->m_data); if (txq->init_state != SFXGE_TXQ_STARTED) { rc = EINTR; goto reject; } /* Load the packet for DMA. */ id = txq->added & txq->ptr_mask; stmp = &txq->stmp[id]; rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag, stmp->map, mbuf, dma_seg, &n_dma_seg, 0); if (rc == EFBIG) { /* Try again. */ struct mbuf *new_mbuf = m_collapse(mbuf, M_NOWAIT, SFXGE_TX_MAPPING_MAX_SEG); if (new_mbuf == NULL) goto reject; ++txq->collapses; mbuf = new_mbuf; rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag, stmp->map, mbuf, dma_seg, &n_dma_seg, 0); } if (rc != 0) goto reject; /* Make the packet visible to the hardware. */ bus_dmamap_sync(txq->packet_dma_tag, stmp->map, BUS_DMASYNC_PREWRITE); used_map = &stmp->map; if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) { rc = sfxge_tx_queue_tso(txq, mbuf, dma_seg, n_dma_seg); if (rc < 0) goto reject_mapped; stmp = &txq->stmp[rc]; } else { /* Add the mapping to the fragment list, and set flags * for the buffer. */ i = 0; for (;;) { desc = &txq->pend_desc[i]; desc->eb_addr = dma_seg[i].ds_addr; desc->eb_size = dma_seg[i].ds_len; if (i == n_dma_seg - 1) { desc->eb_eop = 1; break; } desc->eb_eop = 0; i++; stmp->flags = 0; if (__predict_false(stmp == &txq->stmp[txq->ptr_mask])) stmp = &txq->stmp[0]; else stmp++; } txq->n_pend_desc = n_dma_seg; } /* * If the mapping required more than one descriptor * then we need to associate the DMA map with the last * descriptor, not the first. */ if (used_map != &stmp->map) { map = stmp->map; stmp->map = *used_map; *used_map = map; } stmp->u.mbuf = mbuf; stmp->flags = TX_BUF_UNMAP | TX_BUF_MBUF; /* Post the fragment list. */ sfxge_tx_qlist_post(txq); return (0); reject_mapped: bus_dmamap_unload(txq->packet_dma_tag, *used_map); reject: /* Drop the packet on the floor. */ m_freem(mbuf); ++txq->drops; return (rc); } #ifdef SFXGE_HAVE_MQ /* * Drain the deferred packet list into the transmit queue. */ static void sfxge_tx_qdpl_drain(struct sfxge_txq *txq) { struct sfxge_softc *sc; struct sfxge_tx_dpl *stdp; struct mbuf *mbuf, *next; unsigned int count; unsigned int pushed; int rc; mtx_assert(&txq->lock, MA_OWNED); sc = txq->sc; stdp = &txq->dpl; pushed = txq->added; prefetch_read_many(sc->enp); prefetch_read_many(txq->common); mbuf = stdp->std_get; count = stdp->std_get_count; while (count != 0) { KASSERT(mbuf != NULL, ("mbuf == NULL")); next = mbuf->m_nextpkt; mbuf->m_nextpkt = NULL; ETHER_BPF_MTAP(sc->ifnet, mbuf); /* packet capture */ if (next != NULL) prefetch_read_many(next); rc = sfxge_tx_queue_mbuf(txq, mbuf); --count; mbuf = next; if (rc != 0) continue; if (txq->blocked) break; /* Push the fragments to the hardware in batches. */ if (txq->added - pushed >= SFXGE_TX_BATCH) { efx_tx_qpush(txq->common, txq->added); pushed = txq->added; } } if (count == 0) { KASSERT(mbuf == NULL, ("mbuf != NULL")); stdp->std_get = NULL; stdp->std_get_count = 0; stdp->std_getp = &stdp->std_get; } else { stdp->std_get = mbuf; stdp->std_get_count = count; } if (txq->added != pushed) efx_tx_qpush(txq->common, txq->added); KASSERT(txq->blocked || stdp->std_get_count == 0, ("queue unblocked but count is non-zero")); } #define SFXGE_TX_QDPL_PENDING(_txq) \ ((_txq)->dpl.std_put != 0) /* * Service the deferred packet list. * * NOTE: drops the txq mutex! */ static inline void sfxge_tx_qdpl_service(struct sfxge_txq *txq) { mtx_assert(&txq->lock, MA_OWNED); do { if (SFXGE_TX_QDPL_PENDING(txq)) sfxge_tx_qdpl_swizzle(txq); if (!txq->blocked) sfxge_tx_qdpl_drain(txq); mtx_unlock(&txq->lock); } while (SFXGE_TX_QDPL_PENDING(txq) && mtx_trylock(&txq->lock)); } /* * Put a packet on the deferred packet list. * * If we are called with the txq lock held, we put the packet on the "get * list", otherwise we atomically push it on the "put list". The swizzle * function takes care of ordering. * * The length of the put list is bounded by SFXGE_TX_MAX_DEFFERED. We * overload the csum_data field in the mbuf to keep track of this length * because there is no cheap alternative to avoid races. */ static inline int sfxge_tx_qdpl_put(struct sfxge_txq *txq, struct mbuf *mbuf, int locked) { struct sfxge_tx_dpl *stdp; stdp = &txq->dpl; KASSERT(mbuf->m_nextpkt == NULL, ("mbuf->m_nextpkt != NULL")); if (locked) { mtx_assert(&txq->lock, MA_OWNED); sfxge_tx_qdpl_swizzle(txq); if (stdp->std_get_count >= stdp->std_get_max) return (ENOBUFS); *(stdp->std_getp) = mbuf; stdp->std_getp = &mbuf->m_nextpkt; stdp->std_get_count++; } else { volatile uintptr_t *putp; uintptr_t old; uintptr_t new; unsigned old_len; putp = &stdp->std_put; new = (uintptr_t)mbuf; do { old = *putp; if (old != 0) { struct mbuf *mp = (struct mbuf *)old; old_len = mp->m_pkthdr.csum_data; } else old_len = 0; if (old_len >= stdp->std_put_max) return (ENOBUFS); mbuf->m_pkthdr.csum_data = old_len + 1; mbuf->m_nextpkt = (void *)old; } while (atomic_cmpset_ptr(putp, old, new) == 0); } return (0); } /* * Called from if_transmit - will try to grab the txq lock and enqueue to the * put list if it succeeds, otherwise will push onto the defer list. */ int sfxge_tx_packet_add(struct sfxge_txq *txq, struct mbuf *m) { int locked; int rc; if (!SFXGE_LINK_UP(txq->sc)) { rc = ENETDOWN; goto fail; } /* * Try to grab the txq lock. If we are able to get the lock, * the packet will be appended to the "get list" of the deferred * packet list. Otherwise, it will be pushed on the "put list". */ locked = mtx_trylock(&txq->lock); if (sfxge_tx_qdpl_put(txq, m, locked) != 0) { if (locked) mtx_unlock(&txq->lock); rc = ENOBUFS; goto fail; } /* * Try to grab the lock again. * * If we are able to get the lock, we need to process the deferred * packet list. If we are not able to get the lock, another thread * is processing the list. */ if (!locked) locked = mtx_trylock(&txq->lock); if (locked) { /* Try to service the list. */ sfxge_tx_qdpl_service(txq); /* Lock has been dropped. */ } return (0); fail: m_freem(m); atomic_add_long(&txq->early_drops, 1); return (rc); } static void sfxge_tx_qdpl_flush(struct sfxge_txq *txq) { struct sfxge_tx_dpl *stdp = &txq->dpl; struct mbuf *mbuf, *next; mtx_lock(&txq->lock); sfxge_tx_qdpl_swizzle(txq); for (mbuf = stdp->std_get; mbuf != NULL; mbuf = next) { next = mbuf->m_nextpkt; m_freem(mbuf); } stdp->std_get = NULL; stdp->std_get_count = 0; stdp->std_getp = &stdp->std_get; mtx_unlock(&txq->lock); } void sfxge_if_qflush(struct ifnet *ifp) { struct sfxge_softc *sc; int i; sc = ifp->if_softc; for (i = 0; i < SFXGE_TX_SCALE(sc); i++) sfxge_tx_qdpl_flush(sc->txq[i]); } /* * TX start -- called by the stack. */ int sfxge_if_transmit(struct ifnet *ifp, struct mbuf *m) { struct sfxge_softc *sc; struct sfxge_txq *txq; int rc; sc = (struct sfxge_softc *)ifp->if_softc; KASSERT(ifp->if_flags & IFF_UP, ("interface not up")); /* Pick the desired transmit queue. */ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_TSO)) { int index = 0; - if (m->m_flags & M_FLOWID) { + /* check if flowid is set */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { uint32_t hash = m->m_pkthdr.flowid; index = sc->rx_indir_table[hash % SFXGE_RX_SCALE_MAX]; } txq = sc->txq[SFXGE_TXQ_IP_TCP_UDP_CKSUM + index]; } else if (m->m_pkthdr.csum_flags & CSUM_DELAY_IP) { txq = sc->txq[SFXGE_TXQ_IP_CKSUM]; } else { txq = sc->txq[SFXGE_TXQ_NON_CKSUM]; } rc = sfxge_tx_packet_add(txq, m); return (rc); } #else /* !SFXGE_HAVE_MQ */ static void sfxge_if_start_locked(struct ifnet *ifp) { struct sfxge_softc *sc = ifp->if_softc; struct sfxge_txq *txq; struct mbuf *mbuf; unsigned int pushed[SFXGE_TXQ_NTYPES]; unsigned int q_index; if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; if (!sc->port.link_up) return; for (q_index = 0; q_index < SFXGE_TXQ_NTYPES; q_index++) { txq = sc->txq[q_index]; pushed[q_index] = txq->added; } while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { IFQ_DRV_DEQUEUE(&ifp->if_snd, mbuf); if (mbuf == NULL) break; ETHER_BPF_MTAP(ifp, mbuf); /* packet capture */ /* Pick the desired transmit queue. */ if (mbuf->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_TSO)) q_index = SFXGE_TXQ_IP_TCP_UDP_CKSUM; else if (mbuf->m_pkthdr.csum_flags & CSUM_DELAY_IP) q_index = SFXGE_TXQ_IP_CKSUM; else q_index = SFXGE_TXQ_NON_CKSUM; txq = sc->txq[q_index]; if (sfxge_tx_queue_mbuf(txq, mbuf) != 0) continue; if (txq->blocked) { ifp->if_drv_flags |= IFF_DRV_OACTIVE; break; } /* Push the fragments to the hardware in batches. */ if (txq->added - pushed[q_index] >= SFXGE_TX_BATCH) { efx_tx_qpush(txq->common, txq->added); pushed[q_index] = txq->added; } } for (q_index = 0; q_index < SFXGE_TXQ_NTYPES; q_index++) { txq = sc->txq[q_index]; if (txq->added != pushed[q_index]) efx_tx_qpush(txq->common, txq->added); } } void sfxge_if_start(struct ifnet *ifp) { struct sfxge_softc *sc = ifp->if_softc; mtx_lock(&sc->tx_lock); sfxge_if_start_locked(ifp); mtx_unlock(&sc->tx_lock); } static inline void sfxge_tx_qdpl_service(struct sfxge_txq *txq) { struct sfxge_softc *sc = txq->sc; struct ifnet *ifp = sc->ifnet; mtx_assert(&sc->tx_lock, MA_OWNED); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; sfxge_if_start_locked(ifp); mtx_unlock(&sc->tx_lock); } #endif /* SFXGE_HAVE_MQ */ /* * Software "TSO". Not quite as good as doing it in hardware, but * still faster than segmenting in the stack. */ struct sfxge_tso_state { /* Output position */ unsigned out_len; /* Remaining length in current segment */ unsigned seqnum; /* Current sequence number */ unsigned packet_space; /* Remaining space in current packet */ /* Input position */ unsigned dma_seg_i; /* Current DMA segment number */ uint64_t dma_addr; /* DMA address of current position */ unsigned in_len; /* Remaining length in current mbuf */ const struct mbuf *mbuf; /* Input mbuf (head of chain) */ u_short protocol; /* Network protocol (after VLAN decap) */ ssize_t nh_off; /* Offset of network header */ ssize_t tcph_off; /* Offset of TCP header */ unsigned header_len; /* Number of bytes of header */ int full_packet_size; /* Number of bytes to put in each outgoing * segment */ }; static inline const struct ip *tso_iph(const struct sfxge_tso_state *tso) { KASSERT(tso->protocol == htons(ETHERTYPE_IP), ("tso_iph() in non-IPv4 state")); return (const struct ip *)(tso->mbuf->m_data + tso->nh_off); } static inline const struct ip6_hdr *tso_ip6h(const struct sfxge_tso_state *tso) { KASSERT(tso->protocol == htons(ETHERTYPE_IPV6), ("tso_ip6h() in non-IPv6 state")); return (const struct ip6_hdr *)(tso->mbuf->m_data + tso->nh_off); } static inline const struct tcphdr *tso_tcph(const struct sfxge_tso_state *tso) { return (const struct tcphdr *)(tso->mbuf->m_data + tso->tcph_off); } /* Size of preallocated TSO header buffers. Larger blocks must be * allocated from the heap. */ #define TSOH_STD_SIZE 128 /* At most half the descriptors in the queue at any time will refer to * a TSO header buffer, since they must always be followed by a * payload descriptor referring to an mbuf. */ #define TSOH_COUNT(_txq_entries) ((_txq_entries) / 2u) #define TSOH_PER_PAGE (PAGE_SIZE / TSOH_STD_SIZE) #define TSOH_PAGE_COUNT(_txq_entries) \ ((TSOH_COUNT(_txq_entries) + TSOH_PER_PAGE - 1) / TSOH_PER_PAGE) static int tso_init(struct sfxge_txq *txq) { struct sfxge_softc *sc = txq->sc; unsigned int tsoh_page_count = TSOH_PAGE_COUNT(sc->txq_entries); int i, rc; /* Allocate TSO header buffers */ txq->tsoh_buffer = malloc(tsoh_page_count * sizeof(txq->tsoh_buffer[0]), M_SFXGE, M_WAITOK); for (i = 0; i < tsoh_page_count; i++) { rc = sfxge_dma_alloc(sc, PAGE_SIZE, &txq->tsoh_buffer[i]); if (rc != 0) goto fail; } return (0); fail: while (i-- > 0) sfxge_dma_free(&txq->tsoh_buffer[i]); free(txq->tsoh_buffer, M_SFXGE); txq->tsoh_buffer = NULL; return (rc); } static void tso_fini(struct sfxge_txq *txq) { int i; if (txq->tsoh_buffer != NULL) { for (i = 0; i < TSOH_PAGE_COUNT(txq->sc->txq_entries); i++) sfxge_dma_free(&txq->tsoh_buffer[i]); free(txq->tsoh_buffer, M_SFXGE); } } static void tso_start(struct sfxge_tso_state *tso, struct mbuf *mbuf) { struct ether_header *eh = mtod(mbuf, struct ether_header *); tso->mbuf = mbuf; /* Find network protocol and header */ tso->protocol = eh->ether_type; if (tso->protocol == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *veh = mtod(mbuf, struct ether_vlan_header *); tso->protocol = veh->evl_proto; tso->nh_off = sizeof(*veh); } else { tso->nh_off = sizeof(*eh); } /* Find TCP header */ if (tso->protocol == htons(ETHERTYPE_IP)) { KASSERT(tso_iph(tso)->ip_p == IPPROTO_TCP, ("TSO required on non-TCP packet")); tso->tcph_off = tso->nh_off + 4 * tso_iph(tso)->ip_hl; } else { KASSERT(tso->protocol == htons(ETHERTYPE_IPV6), ("TSO required on non-IP packet")); KASSERT(tso_ip6h(tso)->ip6_nxt == IPPROTO_TCP, ("TSO required on non-TCP packet")); tso->tcph_off = tso->nh_off + sizeof(struct ip6_hdr); } /* We assume all headers are linear in the head mbuf */ tso->header_len = tso->tcph_off + 4 * tso_tcph(tso)->th_off; KASSERT(tso->header_len <= mbuf->m_len, ("packet headers fragmented")); tso->full_packet_size = tso->header_len + mbuf->m_pkthdr.tso_segsz; tso->seqnum = ntohl(tso_tcph(tso)->th_seq); /* These flags must not be duplicated */ KASSERT(!(tso_tcph(tso)->th_flags & (TH_URG | TH_SYN | TH_RST)), ("incompatible TCP flag on TSO packet")); tso->out_len = mbuf->m_pkthdr.len - tso->header_len; } /* * tso_fill_packet_with_fragment - form descriptors for the current fragment * * Form descriptors for the current fragment, until we reach the end * of fragment or end-of-packet. Return 0 on success, 1 if not enough * space. */ static void tso_fill_packet_with_fragment(struct sfxge_txq *txq, struct sfxge_tso_state *tso) { efx_buffer_t *desc; int n; if (tso->in_len == 0 || tso->packet_space == 0) return; KASSERT(tso->in_len > 0, ("TSO input length went negative")); KASSERT(tso->packet_space > 0, ("TSO packet space went negative")); n = min(tso->in_len, tso->packet_space); tso->packet_space -= n; tso->out_len -= n; tso->in_len -= n; desc = &txq->pend_desc[txq->n_pend_desc++]; desc->eb_addr = tso->dma_addr; desc->eb_size = n; desc->eb_eop = tso->out_len == 0 || tso->packet_space == 0; tso->dma_addr += n; } /* Callback from bus_dmamap_load() for long TSO headers. */ static void tso_map_long_header(void *dma_addr_ret, bus_dma_segment_t *segs, int nseg, int error) { *(uint64_t *)dma_addr_ret = ((__predict_true(error == 0) && __predict_true(nseg == 1)) ? segs->ds_addr : 0); } /* * tso_start_new_packet - generate a new header and prepare for the new packet * * Generate a new header and prepare for the new packet. Return 0 on * success, or an error code if failed to alloc header. */ static int tso_start_new_packet(struct sfxge_txq *txq, struct sfxge_tso_state *tso, unsigned int id) { struct sfxge_tx_mapping *stmp = &txq->stmp[id]; struct tcphdr *tsoh_th; unsigned ip_length; caddr_t header; uint64_t dma_addr; bus_dmamap_t map; efx_buffer_t *desc; int rc; /* Allocate a DMA-mapped header buffer. */ if (__predict_true(tso->header_len <= TSOH_STD_SIZE)) { unsigned int page_index = (id / 2) / TSOH_PER_PAGE; unsigned int buf_index = (id / 2) % TSOH_PER_PAGE; header = (txq->tsoh_buffer[page_index].esm_base + buf_index * TSOH_STD_SIZE); dma_addr = (txq->tsoh_buffer[page_index].esm_addr + buf_index * TSOH_STD_SIZE); map = txq->tsoh_buffer[page_index].esm_map; stmp->flags = 0; } else { /* We cannot use bus_dmamem_alloc() as that may sleep */ header = malloc(tso->header_len, M_SFXGE, M_NOWAIT); if (__predict_false(!header)) return (ENOMEM); rc = bus_dmamap_load(txq->packet_dma_tag, stmp->map, header, tso->header_len, tso_map_long_header, &dma_addr, BUS_DMA_NOWAIT); if (__predict_false(dma_addr == 0)) { if (rc == 0) { /* Succeeded but got >1 segment */ bus_dmamap_unload(txq->packet_dma_tag, stmp->map); rc = EINVAL; } free(header, M_SFXGE); return (rc); } map = stmp->map; txq->tso_long_headers++; stmp->u.heap_buf = header; stmp->flags = TX_BUF_UNMAP; } tsoh_th = (struct tcphdr *)(header + tso->tcph_off); /* Copy and update the headers. */ memcpy(header, tso->mbuf->m_data, tso->header_len); tsoh_th->th_seq = htonl(tso->seqnum); tso->seqnum += tso->mbuf->m_pkthdr.tso_segsz; if (tso->out_len > tso->mbuf->m_pkthdr.tso_segsz) { /* This packet will not finish the TSO burst. */ ip_length = tso->full_packet_size - tso->nh_off; tsoh_th->th_flags &= ~(TH_FIN | TH_PUSH); } else { /* This packet will be the last in the TSO burst. */ ip_length = tso->header_len - tso->nh_off + tso->out_len; } if (tso->protocol == htons(ETHERTYPE_IP)) { struct ip *tsoh_iph = (struct ip *)(header + tso->nh_off); tsoh_iph->ip_len = htons(ip_length); /* XXX We should increment ip_id, but FreeBSD doesn't * currently allocate extra IDs for multiple segments. */ } else { struct ip6_hdr *tsoh_iph = (struct ip6_hdr *)(header + tso->nh_off); tsoh_iph->ip6_plen = htons(ip_length - sizeof(*tsoh_iph)); } /* Make the header visible to the hardware. */ bus_dmamap_sync(txq->packet_dma_tag, map, BUS_DMASYNC_PREWRITE); tso->packet_space = tso->mbuf->m_pkthdr.tso_segsz; txq->tso_packets++; /* Form a descriptor for this header. */ desc = &txq->pend_desc[txq->n_pend_desc++]; desc->eb_addr = dma_addr; desc->eb_size = tso->header_len; desc->eb_eop = 0; return (0); } static int sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf, const bus_dma_segment_t *dma_seg, int n_dma_seg) { struct sfxge_tso_state tso; unsigned int id, next_id; tso_start(&tso, mbuf); /* Grab the first payload fragment. */ if (dma_seg->ds_len == tso.header_len) { --n_dma_seg; KASSERT(n_dma_seg, ("no payload found in TSO packet")); ++dma_seg; tso.in_len = dma_seg->ds_len; tso.dma_addr = dma_seg->ds_addr; } else { tso.in_len = dma_seg->ds_len - tso.header_len; tso.dma_addr = dma_seg->ds_addr + tso.header_len; } id = txq->added & txq->ptr_mask; if (__predict_false(tso_start_new_packet(txq, &tso, id))) return (-1); while (1) { id = (id + 1) & txq->ptr_mask; tso_fill_packet_with_fragment(txq, &tso); /* Move onto the next fragment? */ if (tso.in_len == 0) { --n_dma_seg; if (n_dma_seg == 0) break; ++dma_seg; tso.in_len = dma_seg->ds_len; tso.dma_addr = dma_seg->ds_addr; } /* End of packet? */ if (tso.packet_space == 0) { /* If the queue is now full due to tiny MSS, * or we can't create another header, discard * the remainder of the input mbuf but do not * roll back the work we have done. */ if (txq->n_pend_desc > SFXGE_TSO_MAX_DESC - (1 + SFXGE_TX_MAPPING_MAX_SEG)) break; next_id = (id + 1) & txq->ptr_mask; if (__predict_false(tso_start_new_packet(txq, &tso, next_id))) break; id = next_id; } } txq->tso_bursts++; return (id); } static void sfxge_tx_qunblock(struct sfxge_txq *txq) { struct sfxge_softc *sc; struct sfxge_evq *evq; sc = txq->sc; evq = sc->evq[txq->evq_index]; mtx_assert(&evq->lock, MA_OWNED); if (txq->init_state != SFXGE_TXQ_STARTED) return; mtx_lock(SFXGE_TXQ_LOCK(txq)); if (txq->blocked) { unsigned int level; level = txq->added - txq->completed; if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries)) txq->blocked = 0; } sfxge_tx_qdpl_service(txq); /* note: lock has been dropped */ } void sfxge_tx_qflush_done(struct sfxge_txq *txq) { txq->flush_state = SFXGE_FLUSH_DONE; } static void sfxge_tx_qstop(struct sfxge_softc *sc, unsigned int index) { struct sfxge_txq *txq; struct sfxge_evq *evq; unsigned int count; txq = sc->txq[index]; evq = sc->evq[txq->evq_index]; mtx_lock(SFXGE_TXQ_LOCK(txq)); KASSERT(txq->init_state == SFXGE_TXQ_STARTED, ("txq->init_state != SFXGE_TXQ_STARTED")); txq->init_state = SFXGE_TXQ_INITIALIZED; txq->flush_state = SFXGE_FLUSH_PENDING; /* Flush the transmit queue. */ efx_tx_qflush(txq->common); mtx_unlock(SFXGE_TXQ_LOCK(txq)); count = 0; do { /* Spin for 100ms. */ DELAY(100000); if (txq->flush_state != SFXGE_FLUSH_PENDING) break; } while (++count < 20); mtx_lock(&evq->lock); mtx_lock(SFXGE_TXQ_LOCK(txq)); KASSERT(txq->flush_state != SFXGE_FLUSH_FAILED, ("txq->flush_state == SFXGE_FLUSH_FAILED")); txq->flush_state = SFXGE_FLUSH_DONE; txq->blocked = 0; txq->pending = txq->added; sfxge_tx_qcomplete(txq); KASSERT(txq->completed == txq->added, ("txq->completed != txq->added")); sfxge_tx_qreap(txq); KASSERT(txq->reaped == txq->completed, ("txq->reaped != txq->completed")); txq->added = 0; txq->pending = 0; txq->completed = 0; txq->reaped = 0; /* Destroy the common code transmit queue. */ efx_tx_qdestroy(txq->common); txq->common = NULL; efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id, EFX_TXQ_NBUFS(sc->txq_entries)); mtx_unlock(&evq->lock); mtx_unlock(SFXGE_TXQ_LOCK(txq)); } static int sfxge_tx_qstart(struct sfxge_softc *sc, unsigned int index) { struct sfxge_txq *txq; efsys_mem_t *esmp; uint16_t flags; struct sfxge_evq *evq; int rc; txq = sc->txq[index]; esmp = &txq->mem; evq = sc->evq[txq->evq_index]; KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED, ("txq->init_state != SFXGE_TXQ_INITIALIZED")); KASSERT(evq->init_state == SFXGE_EVQ_STARTED, ("evq->init_state != SFXGE_EVQ_STARTED")); /* Program the buffer table. */ if ((rc = efx_sram_buf_tbl_set(sc->enp, txq->buf_base_id, esmp, EFX_TXQ_NBUFS(sc->txq_entries))) != 0) return (rc); /* Determine the kind of queue we are creating. */ switch (txq->type) { case SFXGE_TXQ_NON_CKSUM: flags = 0; break; case SFXGE_TXQ_IP_CKSUM: flags = EFX_CKSUM_IPV4; break; case SFXGE_TXQ_IP_TCP_UDP_CKSUM: flags = EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP; break; default: KASSERT(0, ("Impossible TX queue")); flags = 0; break; } /* Create the common code transmit queue. */ if ((rc = efx_tx_qcreate(sc->enp, index, txq->type, esmp, sc->txq_entries, txq->buf_base_id, flags, evq->common, &txq->common)) != 0) goto fail; mtx_lock(SFXGE_TXQ_LOCK(txq)); /* Enable the transmit queue. */ efx_tx_qenable(txq->common); txq->init_state = SFXGE_TXQ_STARTED; mtx_unlock(SFXGE_TXQ_LOCK(txq)); return (0); fail: efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id, EFX_TXQ_NBUFS(sc->txq_entries)); return (rc); } void sfxge_tx_stop(struct sfxge_softc *sc) { const efx_nic_cfg_t *encp; int index; index = SFXGE_TX_SCALE(sc); while (--index >= 0) sfxge_tx_qstop(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index); sfxge_tx_qstop(sc, SFXGE_TXQ_IP_CKSUM); encp = efx_nic_cfg_get(sc->enp); sfxge_tx_qstop(sc, SFXGE_TXQ_NON_CKSUM); /* Tear down the transmit module */ efx_tx_fini(sc->enp); } int sfxge_tx_start(struct sfxge_softc *sc) { int index; int rc; /* Initialize the common code transmit module. */ if ((rc = efx_tx_init(sc->enp)) != 0) return (rc); if ((rc = sfxge_tx_qstart(sc, SFXGE_TXQ_NON_CKSUM)) != 0) goto fail; if ((rc = sfxge_tx_qstart(sc, SFXGE_TXQ_IP_CKSUM)) != 0) goto fail2; for (index = 0; index < SFXGE_TX_SCALE(sc); index++) { if ((rc = sfxge_tx_qstart(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index)) != 0) goto fail3; } return (0); fail3: while (--index >= 0) sfxge_tx_qstop(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index); sfxge_tx_qstop(sc, SFXGE_TXQ_IP_CKSUM); fail2: sfxge_tx_qstop(sc, SFXGE_TXQ_NON_CKSUM); fail: efx_tx_fini(sc->enp); return (rc); } /** * Destroy a transmit queue. */ static void sfxge_tx_qfini(struct sfxge_softc *sc, unsigned int index) { struct sfxge_txq *txq; unsigned int nmaps; txq = sc->txq[index]; KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED, ("txq->init_state != SFXGE_TXQ_INITIALIZED")); if (txq->type == SFXGE_TXQ_IP_TCP_UDP_CKSUM) tso_fini(txq); /* Free the context arrays. */ free(txq->pend_desc, M_SFXGE); nmaps = sc->txq_entries; while (nmaps-- != 0) bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map); free(txq->stmp, M_SFXGE); /* Release DMA memory mapping. */ sfxge_dma_free(&txq->mem); sc->txq[index] = NULL; #ifdef SFXGE_HAVE_MQ mtx_destroy(&txq->lock); #endif free(txq, M_SFXGE); } static int sfxge_tx_qinit(struct sfxge_softc *sc, unsigned int txq_index, enum sfxge_txq_type type, unsigned int evq_index) { char name[16]; struct sysctl_oid *txq_node; struct sfxge_txq *txq; struct sfxge_evq *evq; #ifdef SFXGE_HAVE_MQ struct sfxge_tx_dpl *stdp; #endif efsys_mem_t *esmp; unsigned int nmaps; int rc; txq = malloc(sizeof(struct sfxge_txq), M_SFXGE, M_ZERO | M_WAITOK); txq->sc = sc; txq->entries = sc->txq_entries; txq->ptr_mask = txq->entries - 1; sc->txq[txq_index] = txq; esmp = &txq->mem; evq = sc->evq[evq_index]; /* Allocate and zero DMA space for the descriptor ring. */ if ((rc = sfxge_dma_alloc(sc, EFX_TXQ_SIZE(sc->txq_entries), esmp)) != 0) return (rc); (void)memset(esmp->esm_base, 0, EFX_TXQ_SIZE(sc->txq_entries)); /* Allocate buffer table entries. */ sfxge_sram_buf_tbl_alloc(sc, EFX_TXQ_NBUFS(sc->txq_entries), &txq->buf_base_id); /* Create a DMA tag for packet mappings. */ if (bus_dma_tag_create(sc->parent_dma_tag, 1, 0x1000, MIN(0x3FFFFFFFFFFFUL, BUS_SPACE_MAXADDR), BUS_SPACE_MAXADDR, NULL, NULL, 0x11000, SFXGE_TX_MAPPING_MAX_SEG, 0x1000, 0, NULL, NULL, &txq->packet_dma_tag) != 0) { device_printf(sc->dev, "Couldn't allocate txq DMA tag\n"); rc = ENOMEM; goto fail; } /* Allocate pending descriptor array for batching writes. */ txq->pend_desc = malloc(sizeof(efx_buffer_t) * sc->txq_entries, M_SFXGE, M_ZERO | M_WAITOK); /* Allocate and initialise mbuf DMA mapping array. */ txq->stmp = malloc(sizeof(struct sfxge_tx_mapping) * sc->txq_entries, M_SFXGE, M_ZERO | M_WAITOK); for (nmaps = 0; nmaps < sc->txq_entries; nmaps++) { rc = bus_dmamap_create(txq->packet_dma_tag, 0, &txq->stmp[nmaps].map); if (rc != 0) goto fail2; } snprintf(name, sizeof(name), "%u", txq_index); txq_node = SYSCTL_ADD_NODE( device_get_sysctl_ctx(sc->dev), SYSCTL_CHILDREN(sc->txqs_node), OID_AUTO, name, CTLFLAG_RD, NULL, ""); if (txq_node == NULL) { rc = ENOMEM; goto fail_txq_node; } if (type == SFXGE_TXQ_IP_TCP_UDP_CKSUM && (rc = tso_init(txq)) != 0) goto fail3; #ifdef SFXGE_HAVE_MQ if (sfxge_tx_dpl_get_max <= 0) { log(LOG_ERR, "%s=%d must be greater than 0", SFXGE_PARAM_TX_DPL_GET_MAX, sfxge_tx_dpl_get_max); rc = EINVAL; goto fail_tx_dpl_get_max; } if (sfxge_tx_dpl_put_max < 0) { log(LOG_ERR, "%s=%d must be greater or equal to 0", SFXGE_PARAM_TX_DPL_PUT_MAX, sfxge_tx_dpl_put_max); rc = EINVAL; goto fail_tx_dpl_put_max; } /* Initialize the deferred packet list. */ stdp = &txq->dpl; stdp->std_put_max = sfxge_tx_dpl_put_max; stdp->std_get_max = sfxge_tx_dpl_get_max; stdp->std_getp = &stdp->std_get; mtx_init(&txq->lock, "txq", NULL, MTX_DEF); SYSCTL_ADD_UINT(device_get_sysctl_ctx(sc->dev), SYSCTL_CHILDREN(txq_node), OID_AUTO, "dpl_get_count", CTLFLAG_RD | CTLFLAG_STATS, &stdp->std_get_count, 0, ""); #endif txq->type = type; txq->evq_index = evq_index; txq->txq_index = txq_index; txq->init_state = SFXGE_TXQ_INITIALIZED; return (0); fail_tx_dpl_put_max: fail_tx_dpl_get_max: fail3: fail_txq_node: free(txq->pend_desc, M_SFXGE); fail2: while (nmaps-- != 0) bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map); free(txq->stmp, M_SFXGE); bus_dma_tag_destroy(txq->packet_dma_tag); fail: sfxge_dma_free(esmp); return (rc); } static const struct { const char *name; size_t offset; } sfxge_tx_stats[] = { #define SFXGE_TX_STAT(name, member) \ { #name, offsetof(struct sfxge_txq, member) } SFXGE_TX_STAT(tso_bursts, tso_bursts), SFXGE_TX_STAT(tso_packets, tso_packets), SFXGE_TX_STAT(tso_long_headers, tso_long_headers), SFXGE_TX_STAT(tx_collapses, collapses), SFXGE_TX_STAT(tx_drops, drops), SFXGE_TX_STAT(tx_early_drops, early_drops), }; static int sfxge_tx_stat_handler(SYSCTL_HANDLER_ARGS) { struct sfxge_softc *sc = arg1; unsigned int id = arg2; unsigned long sum; unsigned int index; /* Sum across all TX queues */ sum = 0; for (index = 0; index < SFXGE_TXQ_IP_TCP_UDP_CKSUM + SFXGE_TX_SCALE(sc); index++) sum += *(unsigned long *)((caddr_t)sc->txq[index] + sfxge_tx_stats[id].offset); return (SYSCTL_OUT(req, &sum, sizeof(sum))); } static void sfxge_tx_stat_init(struct sfxge_softc *sc) { struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev); struct sysctl_oid_list *stat_list; unsigned int id; stat_list = SYSCTL_CHILDREN(sc->stats_node); for (id = 0; id < sizeof(sfxge_tx_stats) / sizeof(sfxge_tx_stats[0]); id++) { SYSCTL_ADD_PROC( ctx, stat_list, OID_AUTO, sfxge_tx_stats[id].name, CTLTYPE_ULONG|CTLFLAG_RD, sc, id, sfxge_tx_stat_handler, "LU", ""); } } void sfxge_tx_fini(struct sfxge_softc *sc) { int index; index = SFXGE_TX_SCALE(sc); while (--index >= 0) sfxge_tx_qfini(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index); sfxge_tx_qfini(sc, SFXGE_TXQ_IP_CKSUM); sfxge_tx_qfini(sc, SFXGE_TXQ_NON_CKSUM); } int sfxge_tx_init(struct sfxge_softc *sc) { struct sfxge_intr *intr; int index; int rc; intr = &sc->intr; KASSERT(intr->state == SFXGE_INTR_INITIALIZED, ("intr->state != SFXGE_INTR_INITIALIZED")); sc->txqs_node = SYSCTL_ADD_NODE( device_get_sysctl_ctx(sc->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "txq", CTLFLAG_RD, NULL, "Tx queues"); if (sc->txqs_node == NULL) { rc = ENOMEM; goto fail_txq_node; } /* Initialize the transmit queues */ if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_NON_CKSUM, SFXGE_TXQ_NON_CKSUM, 0)) != 0) goto fail; if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_IP_CKSUM, SFXGE_TXQ_IP_CKSUM, 0)) != 0) goto fail2; for (index = 0; index < SFXGE_TX_SCALE(sc); index++) { if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index, SFXGE_TXQ_IP_TCP_UDP_CKSUM, index)) != 0) goto fail3; } sfxge_tx_stat_init(sc); return (0); fail3: sfxge_tx_qfini(sc, SFXGE_TXQ_IP_CKSUM); while (--index >= 0) sfxge_tx_qfini(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index); fail2: sfxge_tx_qfini(sc, SFXGE_TXQ_NON_CKSUM); fail: fail_txq_node: return (rc); } Index: head/sys/dev/virtio/network/if_vtnet.c =================================================================== --- head/sys/dev/virtio/network/if_vtnet.c (revision 275357) +++ head/sys/dev/virtio/network/if_vtnet.c (revision 275358) @@ -1,3948 +1,3949 @@ /*- * Copyright (c) 2011, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Driver for VirtIO network devices. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "virtio_if.h" #include "opt_inet.h" #include "opt_inet6.h" static int vtnet_modevent(module_t, int, void *); static int vtnet_probe(device_t); static int vtnet_attach(device_t); static int vtnet_detach(device_t); static int vtnet_suspend(device_t); static int vtnet_resume(device_t); static int vtnet_shutdown(device_t); static int vtnet_attach_completed(device_t); static int vtnet_config_change(device_t); static void vtnet_negotiate_features(struct vtnet_softc *); static void vtnet_setup_features(struct vtnet_softc *); static int vtnet_init_rxq(struct vtnet_softc *, int); static int vtnet_init_txq(struct vtnet_softc *, int); static int vtnet_alloc_rxtx_queues(struct vtnet_softc *); static void vtnet_free_rxtx_queues(struct vtnet_softc *); static int vtnet_alloc_rx_filters(struct vtnet_softc *); static void vtnet_free_rx_filters(struct vtnet_softc *); static int vtnet_alloc_virtqueues(struct vtnet_softc *); static int vtnet_setup_interface(struct vtnet_softc *); static int vtnet_change_mtu(struct vtnet_softc *, int); static int vtnet_ioctl(struct ifnet *, u_long, caddr_t); static uint64_t vtnet_get_counter(struct ifnet *, ift_counter); static int vtnet_rxq_populate(struct vtnet_rxq *); static void vtnet_rxq_free_mbufs(struct vtnet_rxq *); static struct mbuf * vtnet_rx_alloc_buf(struct vtnet_softc *, int , struct mbuf **); static int vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *, struct mbuf *, int); static int vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int); static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *); static int vtnet_rxq_new_buf(struct vtnet_rxq *); static int vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *, struct virtio_net_hdr *); static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int); static void vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *); static int vtnet_rxq_merged_eof(struct vtnet_rxq *, struct mbuf *, int); static void vtnet_rxq_input(struct vtnet_rxq *, struct mbuf *, struct virtio_net_hdr *); static int vtnet_rxq_eof(struct vtnet_rxq *); static void vtnet_rx_vq_intr(void *); static void vtnet_rxq_tq_intr(void *, int); static int vtnet_txq_below_threshold(struct vtnet_txq *); static int vtnet_txq_notify(struct vtnet_txq *); static void vtnet_txq_free_mbufs(struct vtnet_txq *); static int vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *, int *, int *, int *); static int vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int, int, struct virtio_net_hdr *); static struct mbuf * vtnet_txq_offload(struct vtnet_txq *, struct mbuf *, struct virtio_net_hdr *); static int vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **, struct vtnet_tx_header *); static int vtnet_txq_encap(struct vtnet_txq *, struct mbuf **); #ifdef VTNET_LEGACY_TX static void vtnet_start_locked(struct vtnet_txq *, struct ifnet *); static void vtnet_start(struct ifnet *); #else static int vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *); static int vtnet_txq_mq_start(struct ifnet *, struct mbuf *); static void vtnet_txq_tq_deferred(void *, int); #endif static void vtnet_txq_start(struct vtnet_txq *); static void vtnet_txq_tq_intr(void *, int); static int vtnet_txq_eof(struct vtnet_txq *); static void vtnet_tx_vq_intr(void *); static void vtnet_tx_start_all(struct vtnet_softc *); #ifndef VTNET_LEGACY_TX static void vtnet_qflush(struct ifnet *); #endif static int vtnet_watchdog(struct vtnet_txq *); static void vtnet_accum_stats(struct vtnet_softc *, struct vtnet_rxq_stats *, struct vtnet_txq_stats *); static void vtnet_tick(void *); static void vtnet_start_taskqueues(struct vtnet_softc *); static void vtnet_free_taskqueues(struct vtnet_softc *); static void vtnet_drain_taskqueues(struct vtnet_softc *); static void vtnet_drain_rxtx_queues(struct vtnet_softc *); static void vtnet_stop_rendezvous(struct vtnet_softc *); static void vtnet_stop(struct vtnet_softc *); static int vtnet_virtio_reinit(struct vtnet_softc *); static void vtnet_init_rx_filters(struct vtnet_softc *); static int vtnet_init_rx_queues(struct vtnet_softc *); static int vtnet_init_tx_queues(struct vtnet_softc *); static int vtnet_init_rxtx_queues(struct vtnet_softc *); static void vtnet_set_active_vq_pairs(struct vtnet_softc *); static int vtnet_reinit(struct vtnet_softc *); static void vtnet_init_locked(struct vtnet_softc *); static void vtnet_init(void *); static void vtnet_free_ctrl_vq(struct vtnet_softc *); static void vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *, struct sglist *, int, int); static int vtnet_ctrl_mac_cmd(struct vtnet_softc *, uint8_t *); static int vtnet_ctrl_mq_cmd(struct vtnet_softc *, uint16_t); static int vtnet_ctrl_rx_cmd(struct vtnet_softc *, int, int); static int vtnet_set_promisc(struct vtnet_softc *, int); static int vtnet_set_allmulti(struct vtnet_softc *, int); static void vtnet_attach_disable_promisc(struct vtnet_softc *); static void vtnet_rx_filter(struct vtnet_softc *); static void vtnet_rx_filter_mac(struct vtnet_softc *); static int vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t); static void vtnet_rx_filter_vlan(struct vtnet_softc *); static void vtnet_update_vlan_filter(struct vtnet_softc *, int, uint16_t); static void vtnet_register_vlan(void *, struct ifnet *, uint16_t); static void vtnet_unregister_vlan(void *, struct ifnet *, uint16_t); static int vtnet_is_link_up(struct vtnet_softc *); static void vtnet_update_link_status(struct vtnet_softc *); static int vtnet_ifmedia_upd(struct ifnet *); static void vtnet_ifmedia_sts(struct ifnet *, struct ifmediareq *); static void vtnet_get_hwaddr(struct vtnet_softc *); static void vtnet_set_hwaddr(struct vtnet_softc *); static void vtnet_vlan_tag_remove(struct mbuf *); static void vtnet_set_rx_process_limit(struct vtnet_softc *); static void vtnet_set_tx_intr_threshold(struct vtnet_softc *); static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *, struct sysctl_oid_list *, struct vtnet_rxq *); static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *, struct sysctl_oid_list *, struct vtnet_txq *); static void vtnet_setup_queue_sysctl(struct vtnet_softc *); static void vtnet_setup_sysctl(struct vtnet_softc *); static int vtnet_rxq_enable_intr(struct vtnet_rxq *); static void vtnet_rxq_disable_intr(struct vtnet_rxq *); static int vtnet_txq_enable_intr(struct vtnet_txq *); static void vtnet_txq_disable_intr(struct vtnet_txq *); static void vtnet_enable_rx_interrupts(struct vtnet_softc *); static void vtnet_enable_tx_interrupts(struct vtnet_softc *); static void vtnet_enable_interrupts(struct vtnet_softc *); static void vtnet_disable_rx_interrupts(struct vtnet_softc *); static void vtnet_disable_tx_interrupts(struct vtnet_softc *); static void vtnet_disable_interrupts(struct vtnet_softc *); static int vtnet_tunable_int(struct vtnet_softc *, const char *, int); /* Tunables. */ static int vtnet_csum_disable = 0; TUNABLE_INT("hw.vtnet.csum_disable", &vtnet_csum_disable); static int vtnet_tso_disable = 0; TUNABLE_INT("hw.vtnet.tso_disable", &vtnet_tso_disable); static int vtnet_lro_disable = 0; TUNABLE_INT("hw.vtnet.lro_disable", &vtnet_lro_disable); static int vtnet_mq_disable = 0; TUNABLE_INT("hw.vtnet.mq_disable", &vtnet_mq_disable); static int vtnet_mq_max_pairs = 0; TUNABLE_INT("hw.vtnet.mq_max_pairs", &vtnet_mq_max_pairs); static int vtnet_rx_process_limit = 512; TUNABLE_INT("hw.vtnet.rx_process_limit", &vtnet_rx_process_limit); static uma_zone_t vtnet_tx_header_zone; static struct virtio_feature_desc vtnet_feature_desc[] = { { VIRTIO_NET_F_CSUM, "TxChecksum" }, { VIRTIO_NET_F_GUEST_CSUM, "RxChecksum" }, { VIRTIO_NET_F_MAC, "MacAddress" }, { VIRTIO_NET_F_GSO, "TxAllGSO" }, { VIRTIO_NET_F_GUEST_TSO4, "RxTSOv4" }, { VIRTIO_NET_F_GUEST_TSO6, "RxTSOv6" }, { VIRTIO_NET_F_GUEST_ECN, "RxECN" }, { VIRTIO_NET_F_GUEST_UFO, "RxUFO" }, { VIRTIO_NET_F_HOST_TSO4, "TxTSOv4" }, { VIRTIO_NET_F_HOST_TSO6, "TxTSOv6" }, { VIRTIO_NET_F_HOST_ECN, "TxTSOECN" }, { VIRTIO_NET_F_HOST_UFO, "TxUFO" }, { VIRTIO_NET_F_MRG_RXBUF, "MrgRxBuf" }, { VIRTIO_NET_F_STATUS, "Status" }, { VIRTIO_NET_F_CTRL_VQ, "ControlVq" }, { VIRTIO_NET_F_CTRL_RX, "RxMode" }, { VIRTIO_NET_F_CTRL_VLAN, "VLanFilter" }, { VIRTIO_NET_F_CTRL_RX_EXTRA, "RxModeExtra" }, { VIRTIO_NET_F_GUEST_ANNOUNCE, "GuestAnnounce" }, { VIRTIO_NET_F_MQ, "Multiqueue" }, { VIRTIO_NET_F_CTRL_MAC_ADDR, "SetMacAddress" }, { 0, NULL } }; static device_method_t vtnet_methods[] = { /* Device methods. */ DEVMETHOD(device_probe, vtnet_probe), DEVMETHOD(device_attach, vtnet_attach), DEVMETHOD(device_detach, vtnet_detach), DEVMETHOD(device_suspend, vtnet_suspend), DEVMETHOD(device_resume, vtnet_resume), DEVMETHOD(device_shutdown, vtnet_shutdown), /* VirtIO methods. */ DEVMETHOD(virtio_attach_completed, vtnet_attach_completed), DEVMETHOD(virtio_config_change, vtnet_config_change), DEVMETHOD_END }; #ifdef DEV_NETMAP #include #endif /* DEV_NETMAP */ static driver_t vtnet_driver = { "vtnet", vtnet_methods, sizeof(struct vtnet_softc) }; static devclass_t vtnet_devclass; DRIVER_MODULE(vtnet, virtio_pci, vtnet_driver, vtnet_devclass, vtnet_modevent, 0); MODULE_VERSION(vtnet, 1); MODULE_DEPEND(vtnet, virtio, 1, 1, 1); static int vtnet_modevent(module_t mod, int type, void *unused) { int error; error = 0; switch (type) { case MOD_LOAD: vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr", sizeof(struct vtnet_tx_header), NULL, NULL, NULL, NULL, 0, 0); break; case MOD_QUIESCE: case MOD_UNLOAD: if (uma_zone_get_cur(vtnet_tx_header_zone) > 0) error = EBUSY; else if (type == MOD_UNLOAD) { uma_zdestroy(vtnet_tx_header_zone); vtnet_tx_header_zone = NULL; } break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static int vtnet_probe(device_t dev) { if (virtio_get_device_type(dev) != VIRTIO_ID_NETWORK) return (ENXIO); device_set_desc(dev, "VirtIO Networking Adapter"); return (BUS_PROBE_DEFAULT); } static int vtnet_attach(device_t dev) { struct vtnet_softc *sc; int error; sc = device_get_softc(dev); sc->vtnet_dev = dev; /* Register our feature descriptions. */ virtio_set_feature_desc(dev, vtnet_feature_desc); VTNET_CORE_LOCK_INIT(sc); callout_init_mtx(&sc->vtnet_tick_ch, VTNET_CORE_MTX(sc), 0); vtnet_setup_sysctl(sc); vtnet_setup_features(sc); error = vtnet_alloc_rx_filters(sc); if (error) { device_printf(dev, "cannot allocate Rx filters\n"); goto fail; } error = vtnet_alloc_rxtx_queues(sc); if (error) { device_printf(dev, "cannot allocate queues\n"); goto fail; } error = vtnet_alloc_virtqueues(sc); if (error) { device_printf(dev, "cannot allocate virtqueues\n"); goto fail; } error = vtnet_setup_interface(sc); if (error) { device_printf(dev, "cannot setup interface\n"); goto fail; } error = virtio_setup_intr(dev, INTR_TYPE_NET); if (error) { device_printf(dev, "cannot setup virtqueue interrupts\n"); /* BMV: This will crash if during boot! */ ether_ifdetach(sc->vtnet_ifp); goto fail; } #ifdef DEV_NETMAP vtnet_netmap_attach(sc); #endif /* DEV_NETMAP */ vtnet_start_taskqueues(sc); fail: if (error) vtnet_detach(dev); return (error); } static int vtnet_detach(device_t dev) { struct vtnet_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); ifp = sc->vtnet_ifp; if (device_is_attached(dev)) { VTNET_CORE_LOCK(sc); vtnet_stop(sc); VTNET_CORE_UNLOCK(sc); callout_drain(&sc->vtnet_tick_ch); vtnet_drain_taskqueues(sc); ether_ifdetach(ifp); } #ifdef DEV_NETMAP netmap_detach(ifp); #endif /* DEV_NETMAP */ vtnet_free_taskqueues(sc); if (sc->vtnet_vlan_attach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach); sc->vtnet_vlan_attach = NULL; } if (sc->vtnet_vlan_detach != NULL) { EVENTHANDLER_DEREGISTER(vlan_unconfg, sc->vtnet_vlan_detach); sc->vtnet_vlan_detach = NULL; } ifmedia_removeall(&sc->vtnet_media); if (ifp != NULL) { if_free(ifp); sc->vtnet_ifp = NULL; } vtnet_free_rxtx_queues(sc); vtnet_free_rx_filters(sc); if (sc->vtnet_ctrl_vq != NULL) vtnet_free_ctrl_vq(sc); VTNET_CORE_LOCK_DESTROY(sc); return (0); } static int vtnet_suspend(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_stop(sc); sc->vtnet_flags |= VTNET_FLAG_SUSPENDED; VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_resume(device_t dev) { struct vtnet_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); ifp = sc->vtnet_ifp; VTNET_CORE_LOCK(sc); if (ifp->if_flags & IFF_UP) vtnet_init_locked(sc); sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED; VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_shutdown(device_t dev) { /* * Suspend already does all of what we need to * do here; we just never expect to be resumed. */ return (vtnet_suspend(dev)); } static int vtnet_attach_completed(device_t dev) { vtnet_attach_disable_promisc(device_get_softc(dev)); return (0); } static int vtnet_config_change(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_update_link_status(sc); if (sc->vtnet_link_active != 0) vtnet_tx_start_all(sc); VTNET_CORE_UNLOCK(sc); return (0); } static void vtnet_negotiate_features(struct vtnet_softc *sc) { device_t dev; uint64_t mask, features; dev = sc->vtnet_dev; mask = 0; /* * TSO and LRO are only available when their corresponding checksum * offload feature is also negotiated. */ if (vtnet_tunable_int(sc, "csum_disable", vtnet_csum_disable)) { mask |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM; mask |= VTNET_TSO_FEATURES | VTNET_LRO_FEATURES; } if (vtnet_tunable_int(sc, "tso_disable", vtnet_tso_disable)) mask |= VTNET_TSO_FEATURES; if (vtnet_tunable_int(sc, "lro_disable", vtnet_lro_disable)) mask |= VTNET_LRO_FEATURES; #ifndef VTNET_LEGACY_TX if (vtnet_tunable_int(sc, "mq_disable", vtnet_mq_disable)) mask |= VIRTIO_NET_F_MQ; #else mask |= VIRTIO_NET_F_MQ; #endif features = VTNET_FEATURES & ~mask; sc->vtnet_features = virtio_negotiate_features(dev, features); if (virtio_with_feature(dev, VTNET_LRO_FEATURES) && virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0) { /* * LRO without mergeable buffers requires special care. This * is not ideal because every receive buffer must be large * enough to hold the maximum TCP packet, the Ethernet header, * and the header. This requires up to 34 descriptors with * MCLBYTES clusters. If we do not have indirect descriptors, * LRO is disabled since the virtqueue will not contain very * many receive buffers. */ if (!virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) { device_printf(dev, "LRO disabled due to both mergeable buffers and " "indirect descriptors not negotiated\n"); features &= ~VTNET_LRO_FEATURES; sc->vtnet_features = virtio_negotiate_features(dev, features); } else sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG; } } static void vtnet_setup_features(struct vtnet_softc *sc) { device_t dev; int max_pairs, max; dev = sc->vtnet_dev; vtnet_negotiate_features(sc); if (virtio_with_feature(dev, VIRTIO_RING_F_EVENT_IDX)) sc->vtnet_flags |= VTNET_FLAG_EVENT_IDX; if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) { /* This feature should always be negotiated. */ sc->vtnet_flags |= VTNET_FLAG_MAC; } if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) { sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS; sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); } else sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr); if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) sc->vtnet_rx_nsegs = VTNET_MRG_RX_SEGS; else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) sc->vtnet_rx_nsegs = VTNET_MAX_RX_SEGS; else sc->vtnet_rx_nsegs = VTNET_MIN_RX_SEGS; if (virtio_with_feature(dev, VIRTIO_NET_F_GSO) || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4) || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) sc->vtnet_tx_nsegs = VTNET_MAX_TX_SEGS; else sc->vtnet_tx_nsegs = VTNET_MIN_TX_SEGS; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) { sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX)) sc->vtnet_flags |= VTNET_FLAG_CTRL_RX; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN)) sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_MAC_ADDR)) sc->vtnet_flags |= VTNET_FLAG_CTRL_MAC; } if (virtio_with_feature(dev, VIRTIO_NET_F_MQ) && sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { max_pairs = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, max_virtqueue_pairs)); if (max_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || max_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) max_pairs = 1; } else max_pairs = 1; if (max_pairs > 1) { /* * Limit the maximum number of queue pairs to the number of * CPUs or the configured maximum. The actual number of * queues that get used may be less. */ max = vtnet_tunable_int(sc, "mq_max_pairs", vtnet_mq_max_pairs); if (max > 0 && max_pairs > max) max_pairs = max; if (max_pairs > mp_ncpus) max_pairs = mp_ncpus; if (max_pairs > VTNET_MAX_QUEUE_PAIRS) max_pairs = VTNET_MAX_QUEUE_PAIRS; if (max_pairs > 1) sc->vtnet_flags |= VTNET_FLAG_MULTIQ; } sc->vtnet_max_vq_pairs = max_pairs; } static int vtnet_init_rxq(struct vtnet_softc *sc, int id) { struct vtnet_rxq *rxq; rxq = &sc->vtnet_rxqs[id]; snprintf(rxq->vtnrx_name, sizeof(rxq->vtnrx_name), "%s-rx%d", device_get_nameunit(sc->vtnet_dev), id); mtx_init(&rxq->vtnrx_mtx, rxq->vtnrx_name, NULL, MTX_DEF); rxq->vtnrx_sc = sc; rxq->vtnrx_id = id; rxq->vtnrx_sg = sglist_alloc(sc->vtnet_rx_nsegs, M_NOWAIT); if (rxq->vtnrx_sg == NULL) return (ENOMEM); TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq); rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT, taskqueue_thread_enqueue, &rxq->vtnrx_tq); return (rxq->vtnrx_tq == NULL ? ENOMEM : 0); } static int vtnet_init_txq(struct vtnet_softc *sc, int id) { struct vtnet_txq *txq; txq = &sc->vtnet_txqs[id]; snprintf(txq->vtntx_name, sizeof(txq->vtntx_name), "%s-tx%d", device_get_nameunit(sc->vtnet_dev), id); mtx_init(&txq->vtntx_mtx, txq->vtntx_name, NULL, MTX_DEF); txq->vtntx_sc = sc; txq->vtntx_id = id; txq->vtntx_sg = sglist_alloc(sc->vtnet_tx_nsegs, M_NOWAIT); if (txq->vtntx_sg == NULL) return (ENOMEM); #ifndef VTNET_LEGACY_TX txq->vtntx_br = buf_ring_alloc(VTNET_DEFAULT_BUFRING_SIZE, M_DEVBUF, M_NOWAIT, &txq->vtntx_mtx); if (txq->vtntx_br == NULL) return (ENOMEM); TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq); #endif TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq); txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT, taskqueue_thread_enqueue, &txq->vtntx_tq); if (txq->vtntx_tq == NULL) return (ENOMEM); return (0); } static int vtnet_alloc_rxtx_queues(struct vtnet_softc *sc) { int i, npairs, error; npairs = sc->vtnet_max_vq_pairs; sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF, M_NOWAIT | M_ZERO); sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL) return (ENOMEM); for (i = 0; i < npairs; i++) { error = vtnet_init_rxq(sc, i); if (error) return (error); error = vtnet_init_txq(sc, i); if (error) return (error); } vtnet_setup_queue_sysctl(sc); return (0); } static void vtnet_destroy_rxq(struct vtnet_rxq *rxq) { rxq->vtnrx_sc = NULL; rxq->vtnrx_id = -1; if (rxq->vtnrx_sg != NULL) { sglist_free(rxq->vtnrx_sg); rxq->vtnrx_sg = NULL; } if (mtx_initialized(&rxq->vtnrx_mtx) != 0) mtx_destroy(&rxq->vtnrx_mtx); } static void vtnet_destroy_txq(struct vtnet_txq *txq) { txq->vtntx_sc = NULL; txq->vtntx_id = -1; if (txq->vtntx_sg != NULL) { sglist_free(txq->vtntx_sg); txq->vtntx_sg = NULL; } #ifndef VTNET_LEGACY_TX if (txq->vtntx_br != NULL) { buf_ring_free(txq->vtntx_br, M_DEVBUF); txq->vtntx_br = NULL; } #endif if (mtx_initialized(&txq->vtntx_mtx) != 0) mtx_destroy(&txq->vtntx_mtx); } static void vtnet_free_rxtx_queues(struct vtnet_softc *sc) { int i; if (sc->vtnet_rxqs != NULL) { for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_destroy_rxq(&sc->vtnet_rxqs[i]); free(sc->vtnet_rxqs, M_DEVBUF); sc->vtnet_rxqs = NULL; } if (sc->vtnet_txqs != NULL) { for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_destroy_txq(&sc->vtnet_txqs[i]); free(sc->vtnet_txqs, M_DEVBUF); sc->vtnet_txqs = NULL; } } static int vtnet_alloc_rx_filters(struct vtnet_softc *sc) { if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { sc->vtnet_mac_filter = malloc(sizeof(struct vtnet_mac_filter), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_mac_filter == NULL) return (ENOMEM); } if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { sc->vtnet_vlan_filter = malloc(sizeof(uint32_t) * VTNET_VLAN_FILTER_NWORDS, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_vlan_filter == NULL) return (ENOMEM); } return (0); } static void vtnet_free_rx_filters(struct vtnet_softc *sc) { if (sc->vtnet_mac_filter != NULL) { free(sc->vtnet_mac_filter, M_DEVBUF); sc->vtnet_mac_filter = NULL; } if (sc->vtnet_vlan_filter != NULL) { free(sc->vtnet_vlan_filter, M_DEVBUF); sc->vtnet_vlan_filter = NULL; } } static int vtnet_alloc_virtqueues(struct vtnet_softc *sc) { device_t dev; struct vq_alloc_info *info; struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i, idx, flags, nvqs, error; dev = sc->vtnet_dev; flags = 0; nvqs = sc->vtnet_max_vq_pairs * 2; if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) nvqs++; info = malloc(sizeof(struct vq_alloc_info) * nvqs, M_TEMP, M_NOWAIT); if (info == NULL) return (ENOMEM); for (i = 0, idx = 0; i < sc->vtnet_max_vq_pairs; i++, idx+=2) { rxq = &sc->vtnet_rxqs[i]; VQ_ALLOC_INFO_INIT(&info[idx], sc->vtnet_rx_nsegs, vtnet_rx_vq_intr, rxq, &rxq->vtnrx_vq, "%s-%d rx", device_get_nameunit(dev), rxq->vtnrx_id); txq = &sc->vtnet_txqs[i]; VQ_ALLOC_INFO_INIT(&info[idx+1], sc->vtnet_tx_nsegs, vtnet_tx_vq_intr, txq, &txq->vtntx_vq, "%s-%d tx", device_get_nameunit(dev), txq->vtntx_id); } if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, NULL, &sc->vtnet_ctrl_vq, "%s ctrl", device_get_nameunit(dev)); } /* * Enable interrupt binding if this is multiqueue. This only matters * when per-vq MSIX is available. */ if (sc->vtnet_flags & VTNET_FLAG_MULTIQ) flags |= 0; error = virtio_alloc_virtqueues(dev, flags, nvqs, info); free(info, M_TEMP); return (error); } static int vtnet_setup_interface(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "cannot allocate ifnet structure\n"); return (ENOSPC); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); /* Approx. */ ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = vtnet_init; ifp->if_ioctl = vtnet_ioctl; ifp->if_get_counter = vtnet_get_counter; #ifndef VTNET_LEGACY_TX ifp->if_transmit = vtnet_txq_mq_start; ifp->if_qflush = vtnet_qflush; #else struct virtqueue *vq = sc->vtnet_txqs[0].vtntx_vq; ifp->if_start = vtnet_start; IFQ_SET_MAXLEN(&ifp->if_snd, virtqueue_size(vq) - 1); ifp->if_snd.ifq_drv_maxlen = virtqueue_size(vq) - 1; IFQ_SET_READY(&ifp->if_snd); #endif ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd, vtnet_ifmedia_sts); ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL); ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE); /* Read (or generate) the MAC address for the adapter. */ vtnet_get_hwaddr(sc); ether_ifattach(ifp, sc->vtnet_hwaddr); if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS)) ifp->if_capabilities |= IFCAP_LINKSTATE; /* Tell the upper layer(s) we support long frames. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU; if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) { ifp->if_capabilities |= IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6; if (virtio_with_feature(dev, VIRTIO_NET_F_GSO)) { ifp->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6; sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; } else { if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4)) ifp->if_capabilities |= IFCAP_TSO4; if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) ifp->if_capabilities |= IFCAP_TSO6; if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN)) sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; } if (ifp->if_capabilities & IFCAP_TSO) ifp->if_capabilities |= IFCAP_VLAN_HWTSO; } if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) { ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6; if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) || virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6)) ifp->if_capabilities |= IFCAP_LRO; } if (ifp->if_capabilities & IFCAP_HWCSUM) { /* * VirtIO does not support VLAN tagging, but we can fake * it by inserting and removing the 802.1Q header during * transmit and receive. We are then able to do checksum * offloading of VLAN frames. */ ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; } ifp->if_capenable = ifp->if_capabilities; /* * Capabilities after here are not enabled by default. */ if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); } vtnet_set_rx_process_limit(sc); vtnet_set_tx_intr_threshold(sc); return (0); } static int vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu) { struct ifnet *ifp; int frame_size, clsize; ifp = sc->vtnet_ifp; if (new_mtu < ETHERMIN || new_mtu > VTNET_MAX_MTU) return (EINVAL); frame_size = sc->vtnet_hdr_size + sizeof(struct ether_vlan_header) + new_mtu; /* * Based on the new MTU (and hence frame size) determine which * cluster size is most appropriate for the receive queues. */ if (frame_size <= MCLBYTES) { clsize = MCLBYTES; } else if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { /* Avoid going past 9K jumbos. */ if (frame_size > MJUM9BYTES) return (EINVAL); clsize = MJUM9BYTES; } else clsize = MJUMPAGESIZE; ifp->if_mtu = new_mtu; sc->vtnet_rx_new_clsize = clsize; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vtnet_init_locked(sc); } return (0); } static int vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct vtnet_softc *sc; struct ifreq *ifr; int reinit, mask, error; sc = ifp->if_softc; ifr = (struct ifreq *) data; error = 0; switch (cmd) { case SIOCSIFMTU: if (ifp->if_mtu != ifr->ifr_mtu) { VTNET_CORE_LOCK(sc); error = vtnet_change_mtu(sc, ifr->ifr_mtu); VTNET_CORE_UNLOCK(sc); } break; case SIOCSIFFLAGS: VTNET_CORE_LOCK(sc); if ((ifp->if_flags & IFF_UP) == 0) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) vtnet_stop(sc); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if ((ifp->if_flags ^ sc->vtnet_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) vtnet_rx_filter(sc); else error = ENOTSUP; } } else vtnet_init_locked(sc); if (error == 0) sc->vtnet_if_flags = ifp->if_flags; VTNET_CORE_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) break; VTNET_CORE_LOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) vtnet_rx_filter_mac(sc); VTNET_CORE_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd); break; case SIOCSIFCAP: VTNET_CORE_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) ifp->if_capenable ^= IFCAP_TXCSUM; if (mask & IFCAP_TXCSUM_IPV6) ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_TSO6) ifp->if_capenable ^= IFCAP_TSO6; if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO | IFCAP_VLAN_HWFILTER)) { /* These Rx features require us to renegotiate. */ reinit = 1; if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWFILTER) ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; } else reinit = 0; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vtnet_init_locked(sc); } VTNET_CORE_UNLOCK(sc); VLAN_CAPABILITIES(ifp); break; default: error = ether_ioctl(ifp, cmd, data); break; } VTNET_CORE_LOCK_ASSERT_NOTOWNED(sc); return (error); } static int vtnet_rxq_populate(struct vtnet_rxq *rxq) { struct virtqueue *vq; int nbufs, error; vq = rxq->vtnrx_vq; error = ENOSPC; for (nbufs = 0; !virtqueue_full(vq); nbufs++) { error = vtnet_rxq_new_buf(rxq); if (error) break; } if (nbufs > 0) { virtqueue_notify(vq); /* * EMSGSIZE signifies the virtqueue did not have enough * entries available to hold the last mbuf. This is not * an error. */ if (error == EMSGSIZE) error = 0; } return (error); } static void vtnet_rxq_free_mbufs(struct vtnet_rxq *rxq) { struct virtqueue *vq; struct mbuf *m; int last; vq = rxq->vtnrx_vq; last = 0; while ((m = virtqueue_drain(vq, &last)) != NULL) m_freem(m); KASSERT(virtqueue_empty(vq), ("%s: mbufs remaining in rx queue %p", __func__, rxq)); } static struct mbuf * vtnet_rx_alloc_buf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp) { struct mbuf *m_head, *m_tail, *m; int i, clsize; clsize = sc->vtnet_rx_clsize; KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG, ("%s: chained mbuf %d request without LRO_NOMRG", __func__, nbufs)); m_head = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, clsize); if (m_head == NULL) goto fail; m_head->m_len = clsize; m_tail = m_head; /* Allocate the rest of the chain. */ for (i = 1; i < nbufs; i++) { m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize); if (m == NULL) goto fail; m->m_len = clsize; m_tail->m_next = m; m_tail = m; } if (m_tailp != NULL) *m_tailp = m_tail; return (m_head); fail: sc->vtnet_stats.mbuf_alloc_failed++; m_freem(m_head); return (NULL); } /* * Slow path for when LRO without mergeable buffers is negotiated. */ static int vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *rxq, struct mbuf *m0, int len0) { struct vtnet_softc *sc; struct mbuf *m, *m_prev; struct mbuf *m_new, *m_tail; int len, clsize, nreplace, error; sc = rxq->vtnrx_sc; clsize = sc->vtnet_rx_clsize; m_prev = NULL; m_tail = NULL; nreplace = 0; m = m0; len = len0; /* * Since these mbuf chains are so large, we avoid allocating an * entire replacement chain if possible. When the received frame * did not consume the entire chain, the unused mbufs are moved * to the replacement chain. */ while (len > 0) { /* * Something is seriously wrong if we received a frame * larger than the chain. Drop it. */ if (m == NULL) { sc->vtnet_stats.rx_frame_too_large++; return (EMSGSIZE); } /* We always allocate the same cluster size. */ KASSERT(m->m_len == clsize, ("%s: mbuf size %d is not the cluster size %d", __func__, m->m_len, clsize)); m->m_len = MIN(m->m_len, len); len -= m->m_len; m_prev = m; m = m->m_next; nreplace++; } KASSERT(nreplace <= sc->vtnet_rx_nmbufs, ("%s: too many replacement mbufs %d max %d", __func__, nreplace, sc->vtnet_rx_nmbufs)); m_new = vtnet_rx_alloc_buf(sc, nreplace, &m_tail); if (m_new == NULL) { m_prev->m_len = clsize; return (ENOBUFS); } /* * Move any unused mbufs from the received chain onto the end * of the new chain. */ if (m_prev->m_next != NULL) { m_tail->m_next = m_prev->m_next; m_prev->m_next = NULL; } error = vtnet_rxq_enqueue_buf(rxq, m_new); if (error) { /* * BAD! We could not enqueue the replacement mbuf chain. We * must restore the m0 chain to the original state if it was * modified so we can subsequently discard it. * * NOTE: The replacement is suppose to be an identical copy * to the one just dequeued so this is an unexpected error. */ sc->vtnet_stats.rx_enq_replacement_failed++; if (m_tail->m_next != NULL) { m_prev->m_next = m_tail->m_next; m_tail->m_next = NULL; } m_prev->m_len = clsize; m_freem(m_new); } return (error); } static int vtnet_rxq_replace_buf(struct vtnet_rxq *rxq, struct mbuf *m, int len) { struct vtnet_softc *sc; struct mbuf *m_new; int error; sc = rxq->vtnrx_sc; KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL, ("%s: chained mbuf without LRO_NOMRG", __func__)); if (m->m_next == NULL) { /* Fast-path for the common case of just one mbuf. */ if (m->m_len < len) return (EINVAL); m_new = vtnet_rx_alloc_buf(sc, 1, NULL); if (m_new == NULL) return (ENOBUFS); error = vtnet_rxq_enqueue_buf(rxq, m_new); if (error) { /* * The new mbuf is suppose to be an identical * copy of the one just dequeued so this is an * unexpected error. */ m_freem(m_new); sc->vtnet_stats.rx_enq_replacement_failed++; } else m->m_len = len; } else error = vtnet_rxq_replace_lro_nomgr_buf(rxq, m, len); return (error); } static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *rxq, struct mbuf *m) { struct vtnet_softc *sc; struct sglist *sg; struct vtnet_rx_header *rxhdr; uint8_t *mdata; int offset, error; sc = rxq->vtnrx_sc; sg = rxq->vtnrx_sg; mdata = mtod(m, uint8_t *); VTNET_RXQ_LOCK_ASSERT(rxq); KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL, ("%s: chained mbuf without LRO_NOMRG", __func__)); KASSERT(m->m_len == sc->vtnet_rx_clsize, ("%s: unexpected cluster size %d/%d", __func__, m->m_len, sc->vtnet_rx_clsize)); sglist_reset(sg); if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr)); rxhdr = (struct vtnet_rx_header *) mdata; sglist_append(sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size); offset = sizeof(struct vtnet_rx_header); } else offset = 0; sglist_append(sg, mdata + offset, m->m_len - offset); if (m->m_next != NULL) { error = sglist_append_mbuf(sg, m->m_next); MPASS(error == 0); } error = virtqueue_enqueue(rxq->vtnrx_vq, m, sg, 0, sg->sg_nseg); return (error); } static int vtnet_rxq_new_buf(struct vtnet_rxq *rxq) { struct vtnet_softc *sc; struct mbuf *m; int error; sc = rxq->vtnrx_sc; m = vtnet_rx_alloc_buf(sc, sc->vtnet_rx_nmbufs, NULL); if (m == NULL) return (ENOBUFS); error = vtnet_rxq_enqueue_buf(rxq, m); if (error) m_freem(m); return (error); } /* * Use the checksum offset in the VirtIO header to set the * correct CSUM_* flags. */ static int vtnet_rxq_csum_by_offset(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; #if defined(INET) || defined(INET6) int offset = hdr->csum_start + hdr->csum_offset; #endif sc = rxq->vtnrx_sc; /* Only do a basic sanity check on the offset. */ switch (eth_type) { #if defined(INET) case ETHERTYPE_IP: if (__predict_false(offset < ip_start + sizeof(struct ip))) return (1); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr))) return (1); break; #endif default: sc->vtnet_stats.rx_csum_bad_ethtype++; return (1); } /* * Use the offset to determine the appropriate CSUM_* flags. This is * a bit dirty, but we can get by with it since the checksum offsets * happen to be different. We assume the host host does not do IPv4 * header checksum offloading. */ switch (hdr->csum_offset) { case offsetof(struct udphdr, uh_sum): case offsetof(struct tcphdr, th_sum): m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case offsetof(struct sctphdr, checksum): m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; break; default: sc->vtnet_stats.rx_csum_bad_offset++; return (1); } return (0); } static int vtnet_rxq_csum_by_parse(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; int offset, proto; sc = rxq->vtnrx_sc; switch (eth_type) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip; if (__predict_false(m->m_len < ip_start + sizeof(struct ip))) return (1); ip = (struct ip *)(m->m_data + ip_start); proto = ip->ip_p; offset = ip_start + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(m->m_len < ip_start + sizeof(struct ip6_hdr))) return (1); offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto); if (__predict_false(offset < 0)) return (1); break; #endif default: sc->vtnet_stats.rx_csum_bad_ethtype++; return (1); } switch (proto) { case IPPROTO_TCP: if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case IPPROTO_UDP: if (__predict_false(m->m_len < offset + sizeof(struct udphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case IPPROTO_SCTP: if (__predict_false(m->m_len < offset + sizeof(struct sctphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; break; default: /* * For the remaining protocols, FreeBSD does not support * checksum offloading, so the checksum will be recomputed. */ #if 0 if_printf(sc->vtnet_ifp, "cksum offload of unsupported " "protocol eth_type=%#x proto=%d csum_start=%d " "csum_offset=%d\n", __func__, eth_type, proto, hdr->csum_start, hdr->csum_offset); #endif break; } return (0); } /* * Set the appropriate CSUM_* flags. Unfortunately, the information * provided is not directly useful to us. The VirtIO header gives the * offset of the checksum, which is all Linux needs, but this is not * how FreeBSD does things. We are forced to peek inside the packet * a bit. * * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD * could accept the offsets and let the stack figure it out. */ static int vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct ether_header *eh; struct ether_vlan_header *evh; uint16_t eth_type; int offset, error; eh = mtod(m, struct ether_header *); eth_type = ntohs(eh->ether_type); if (eth_type == ETHERTYPE_VLAN) { /* BMV: We should handle nested VLAN tags too. */ evh = mtod(m, struct ether_vlan_header *); eth_type = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else offset = sizeof(struct ether_header); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) error = vtnet_rxq_csum_by_offset(rxq, m, eth_type, offset, hdr); else error = vtnet_rxq_csum_by_parse(rxq, m, eth_type, offset, hdr); return (error); } static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs) { struct mbuf *m; while (--nbufs > 0) { m = virtqueue_dequeue(rxq->vtnrx_vq, NULL); if (m == NULL) break; vtnet_rxq_discard_buf(rxq, m); } } static void vtnet_rxq_discard_buf(struct vtnet_rxq *rxq, struct mbuf *m) { int error; /* * Requeue the discarded mbuf. This should always be successful * since it was just dequeued. */ error = vtnet_rxq_enqueue_buf(rxq, m); KASSERT(error == 0, ("%s: cannot requeue discarded mbuf %d", __func__, error)); } static int vtnet_rxq_merged_eof(struct vtnet_rxq *rxq, struct mbuf *m_head, int nbufs) { struct vtnet_softc *sc; struct ifnet *ifp; struct virtqueue *vq; struct mbuf *m, *m_tail; int len; sc = rxq->vtnrx_sc; vq = rxq->vtnrx_vq; ifp = sc->vtnet_ifp; m_tail = m_head; while (--nbufs > 0) { m = virtqueue_dequeue(vq, &len); if (m == NULL) { rxq->vtnrx_stats.vrxs_ierrors++; goto fail; } if (vtnet_rxq_new_buf(rxq) != 0) { rxq->vtnrx_stats.vrxs_iqdrops++; vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) vtnet_rxq_discard_merged_bufs(rxq, nbufs); goto fail; } if (m->m_len < len) len = m->m_len; m->m_len = len; m->m_flags &= ~M_PKTHDR; m_head->m_pkthdr.len += len; m_tail->m_next = m; m_tail = m; } return (0); fail: sc->vtnet_stats.rx_mergeable_failed++; m_freem(m_head); return (1); } static void vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; struct ifnet *ifp; struct ether_header *eh; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { eh = mtod(m, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { vtnet_vlan_tag_remove(m); /* * With the 802.1Q header removed, update the * checksum starting location accordingly. */ if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) hdr->csum_start -= ETHER_VLAN_ENCAP_LEN; } } m->m_pkthdr.flowid = rxq->vtnrx_id; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); /* * BMV: FreeBSD does not have the UNNECESSARY and PARTIAL checksum * distinction that Linux does. Need to reevaluate if performing * offloading for the NEEDS_CSUM case is really appropriate. */ if (hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM | VIRTIO_NET_HDR_F_DATA_VALID)) { if (vtnet_rxq_csum(rxq, m, hdr) == 0) rxq->vtnrx_stats.vrxs_csum++; else rxq->vtnrx_stats.vrxs_csum_failed++; } rxq->vtnrx_stats.vrxs_ipackets++; rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len; VTNET_RXQ_UNLOCK(rxq); (*ifp->if_input)(ifp, m); VTNET_RXQ_LOCK(rxq); } static int vtnet_rxq_eof(struct vtnet_rxq *rxq) { struct virtio_net_hdr lhdr, *hdr; struct vtnet_softc *sc; struct ifnet *ifp; struct virtqueue *vq; struct mbuf *m; struct virtio_net_hdr_mrg_rxbuf *mhdr; int len, deq, nbufs, adjsz, count; sc = rxq->vtnrx_sc; vq = rxq->vtnrx_vq; ifp = sc->vtnet_ifp; hdr = &lhdr; deq = 0; count = sc->vtnet_rx_process_limit; VTNET_RXQ_LOCK_ASSERT(rxq); #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, 0, &deq)) { return (FALSE); } #endif /* DEV_NETMAP */ while (count-- > 0) { m = virtqueue_dequeue(vq, &len); if (m == NULL) break; deq++; if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) { rxq->vtnrx_stats.vrxs_ierrors++; vtnet_rxq_discard_buf(rxq, m); continue; } if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { nbufs = 1; adjsz = sizeof(struct vtnet_rx_header); /* * Account for our pad inserted between the header * and the actual start of the frame. */ len += VTNET_RX_HEADER_PAD; } else { mhdr = mtod(m, struct virtio_net_hdr_mrg_rxbuf *); nbufs = mhdr->num_buffers; adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf); } if (vtnet_rxq_replace_buf(rxq, m, len) != 0) { rxq->vtnrx_stats.vrxs_iqdrops++; vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) vtnet_rxq_discard_merged_bufs(rxq, nbufs); continue; } m->m_pkthdr.len = len; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_flags = 0; if (nbufs > 1) { /* Dequeue the rest of chain. */ if (vtnet_rxq_merged_eof(rxq, m, nbufs) != 0) continue; } /* * Save copy of header before we strip it. For both mergeable * and non-mergeable, the header is at the beginning of the * mbuf data. We no longer need num_buffers, so always use a * regular header. * * BMV: Is this memcpy() expensive? We know the mbuf data is * still valid even after the m_adj(). */ memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr)); m_adj(m, adjsz); vtnet_rxq_input(rxq, m, hdr); /* Must recheck after dropping the Rx lock. */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (deq > 0) virtqueue_notify(vq); return (count > 0 ? 0 : EAGAIN); } static void vtnet_rx_vq_intr(void *xrxq) { struct vtnet_softc *sc; struct vtnet_rxq *rxq; struct ifnet *ifp; int tries, more; rxq = xrxq; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; tries = 0; if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) { /* * Ignore this interrupt. Either this is a spurious interrupt * or multiqueue without per-VQ MSIX so every queue needs to * be polled (a brain dead configuration we could try harder * to avoid). */ vtnet_rxq_disable_intr(rxq); return; } VTNET_RXQ_LOCK(rxq); again: if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VTNET_RXQ_UNLOCK(rxq); return; } more = vtnet_rxq_eof(rxq); if (more || vtnet_rxq_enable_intr(rxq) != 0) { if (!more) vtnet_rxq_disable_intr(rxq); /* * This is an occasional condition or race (when !more), * so retry a few times before scheduling the taskqueue. */ if (tries++ < VTNET_INTR_DISABLE_RETRIES) goto again; VTNET_RXQ_UNLOCK(rxq); rxq->vtnrx_stats.vrxs_rescheduled++; taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } else VTNET_RXQ_UNLOCK(rxq); } static void vtnet_rxq_tq_intr(void *xrxq, int pending) { struct vtnet_softc *sc; struct vtnet_rxq *rxq; struct ifnet *ifp; int more; rxq = xrxq; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; VTNET_RXQ_LOCK(rxq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VTNET_RXQ_UNLOCK(rxq); return; } more = vtnet_rxq_eof(rxq); if (more || vtnet_rxq_enable_intr(rxq) != 0) { if (!more) vtnet_rxq_disable_intr(rxq); rxq->vtnrx_stats.vrxs_rescheduled++; taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } VTNET_RXQ_UNLOCK(rxq); } static int vtnet_txq_below_threshold(struct vtnet_txq *txq) { struct vtnet_softc *sc; struct virtqueue *vq; sc = txq->vtntx_sc; vq = txq->vtntx_vq; return (virtqueue_nfree(vq) <= sc->vtnet_tx_intr_thresh); } static int vtnet_txq_notify(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; txq->vtntx_watchdog = VTNET_TX_TIMEOUT; virtqueue_notify(vq); if (vtnet_txq_enable_intr(txq) == 0) return (0); /* * Drain frames that were completed since last checked. If this * causes the queue to go above the threshold, the caller should * continue transmitting. */ if (vtnet_txq_eof(txq) != 0 && vtnet_txq_below_threshold(txq) == 0) { virtqueue_disable_intr(vq); return (1); } return (0); } static void vtnet_txq_free_mbufs(struct vtnet_txq *txq) { struct virtqueue *vq; struct vtnet_tx_header *txhdr; int last; vq = txq->vtntx_vq; last = 0; while ((txhdr = virtqueue_drain(vq, &last)) != NULL) { m_freem(txhdr->vth_mbuf); uma_zfree(vtnet_tx_header_zone, txhdr); } KASSERT(virtqueue_empty(vq), ("%s: mbufs remaining in tx queue %p", __func__, txq)); } /* * BMV: Much of this can go away once we finally have offsets in * the mbuf packet header. Bug andre@. */ static int vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m, int *etype, int *proto, int *start) { struct vtnet_softc *sc; struct ether_vlan_header *evh; int offset; sc = txq->vtntx_sc; evh = mtod(m, struct ether_vlan_header *); if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { /* BMV: We should handle nested VLAN tags too. */ *etype = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else { *etype = ntohs(evh->evl_encap_proto); offset = sizeof(struct ether_header); } switch (*etype) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip, iphdr; if (__predict_false(m->m_len < offset + sizeof(struct ip))) { m_copydata(m, offset, sizeof(struct ip), (caddr_t) &iphdr); ip = &iphdr; } else ip = (struct ip *)(m->m_data + offset); *proto = ip->ip_p; *start = offset + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: *proto = -1; *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); /* Assert the network stack sent us a valid packet. */ KASSERT(*start > offset, ("%s: mbuf %p start %d offset %d proto %d", __func__, m, *start, offset, *proto)); break; #endif default: sc->vtnet_stats.tx_csum_bad_ethtype++; return (EINVAL); } return (0); } static int vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type, int offset, struct virtio_net_hdr *hdr) { static struct timeval lastecn; static int curecn; struct vtnet_softc *sc; struct tcphdr *tcp, tcphdr; sc = txq->vtntx_sc; if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) { m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr); tcp = &tcphdr; } else tcp = (struct tcphdr *)(m->m_data + offset); hdr->hdr_len = offset + (tcp->th_off << 2); hdr->gso_size = m->m_pkthdr.tso_segsz; hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 : VIRTIO_NET_HDR_GSO_TCPV6; if (tcp->th_flags & TH_CWR) { /* * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD, * ECN support is not on a per-interface basis, but globally via * the net.inet.tcp.ecn.enable sysctl knob. The default is off. */ if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) { if (ppsratecheck(&lastecn, &curecn, 1)) if_printf(sc->vtnet_ifp, "TSO with ECN not negotiated with host\n"); return (ENOTSUP); } hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } txq->vtntx_stats.vtxs_tso++; return (0); } static struct mbuf * vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; int flags, etype, csum_start, proto, error; sc = txq->vtntx_sc; flags = m->m_pkthdr.csum_flags; error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start); if (error) goto drop; if ((etype == ETHERTYPE_IP && flags & VTNET_CSUM_OFFLOAD) || (etype == ETHERTYPE_IPV6 && flags & VTNET_CSUM_OFFLOAD_IPV6)) { /* * We could compare the IP protocol vs the CSUM_ flag too, * but that really should not be necessary. */ hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = csum_start; hdr->csum_offset = m->m_pkthdr.csum_data; txq->vtntx_stats.vtxs_csum++; } if (flags & CSUM_TSO) { if (__predict_false(proto != IPPROTO_TCP)) { /* Likely failed to correctly parse the mbuf. */ sc->vtnet_stats.tx_tso_not_tcp++; goto drop; } KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM, ("%s: mbuf %p TSO without checksum offload %#x", __func__, m, flags)); error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr); if (error) goto drop; } return (m); drop: m_freem(m); return (NULL); } static int vtnet_txq_enqueue_buf(struct vtnet_txq *txq, struct mbuf **m_head, struct vtnet_tx_header *txhdr) { struct vtnet_softc *sc; struct virtqueue *vq; struct sglist *sg; struct mbuf *m; int error; sc = txq->vtntx_sc; vq = txq->vtntx_vq; sg = txq->vtntx_sg; m = *m_head; sglist_reset(sg); error = sglist_append(sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size); KASSERT(error == 0 && sg->sg_nseg == 1, ("%s: error %d adding header to sglist", __func__, error)); error = sglist_append_mbuf(sg, m); if (error) { m = m_defrag(m, M_NOWAIT); if (m == NULL) goto fail; *m_head = m; sc->vtnet_stats.tx_defragged++; error = sglist_append_mbuf(sg, m); if (error) goto fail; } txhdr->vth_mbuf = m; error = virtqueue_enqueue(vq, txhdr, sg, sg->sg_nseg, 0); return (error); fail: sc->vtnet_stats.tx_defrag_failed++; m_freem(*m_head); *m_head = NULL; return (ENOBUFS); } static int vtnet_txq_encap(struct vtnet_txq *txq, struct mbuf **m_head) { struct vtnet_tx_header *txhdr; struct virtio_net_hdr *hdr; struct mbuf *m; int error; m = *m_head; M_ASSERTPKTHDR(m); txhdr = uma_zalloc(vtnet_tx_header_zone, M_NOWAIT | M_ZERO); if (txhdr == NULL) { m_freem(m); *m_head = NULL; return (ENOMEM); } /* * Always use the non-mergeable header, regardless if the feature * was negotiated. For transmit, num_buffers is always zero. The * vtnet_hdr_size is used to enqueue the correct header size. */ hdr = &txhdr->vth_uhdr.hdr; if (m->m_flags & M_VLANTAG) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } m->m_flags &= ~M_VLANTAG; } if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) { m = vtnet_txq_offload(txq, m, hdr); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } } error = vtnet_txq_enqueue_buf(txq, m_head, txhdr); if (error == 0) return (0); fail: uma_zfree(vtnet_tx_header_zone, txhdr); return (error); } #ifdef VTNET_LEGACY_TX static void vtnet_start_locked(struct vtnet_txq *txq, struct ifnet *ifp) { struct vtnet_softc *sc; struct virtqueue *vq; struct mbuf *m0; int tries, enq; sc = txq->vtntx_sc; vq = txq->vtntx_vq; tries = 0; VTNET_TXQ_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->vtnet_link_active == 0) return; vtnet_txq_eof(txq); again: enq = 0; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if (virtqueue_full(vq)) break; IFQ_DRV_DEQUEUE(&ifp->if_snd, m0); if (m0 == NULL) break; if (vtnet_txq_encap(txq, &m0) != 0) { if (m0 != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m0); break; } enq++; ETHER_BPF_MTAP(ifp, m0); } if (enq > 0 && vtnet_txq_notify(txq) != 0) { if (tries++ < VTNET_NOTIFY_RETRIES) goto again; txq->vtntx_stats.vtxs_rescheduled++; taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); } } static void vtnet_start(struct ifnet *ifp) { struct vtnet_softc *sc; struct vtnet_txq *txq; sc = ifp->if_softc; txq = &sc->vtnet_txqs[0]; VTNET_TXQ_LOCK(txq); vtnet_start_locked(txq, ifp); VTNET_TXQ_UNLOCK(txq); } #else /* !VTNET_LEGACY_TX */ static int vtnet_txq_mq_start_locked(struct vtnet_txq *txq, struct mbuf *m) { struct vtnet_softc *sc; struct virtqueue *vq; struct buf_ring *br; struct ifnet *ifp; int enq, tries, error; sc = txq->vtntx_sc; vq = txq->vtntx_vq; br = txq->vtntx_br; ifp = sc->vtnet_ifp; tries = 0; error = 0; VTNET_TXQ_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->vtnet_link_active == 0) { if (m != NULL) error = drbr_enqueue(ifp, br, m); return (error); } if (m != NULL) { error = drbr_enqueue(ifp, br, m); if (error) return (error); } vtnet_txq_eof(txq); again: enq = 0; while ((m = drbr_peek(ifp, br)) != NULL) { if (virtqueue_full(vq)) { drbr_putback(ifp, br, m); break; } if (vtnet_txq_encap(txq, &m) != 0) { if (m != NULL) drbr_putback(ifp, br, m); else drbr_advance(ifp, br); break; } drbr_advance(ifp, br); enq++; ETHER_BPF_MTAP(ifp, m); } if (enq > 0 && vtnet_txq_notify(txq) != 0) { if (tries++ < VTNET_NOTIFY_RETRIES) goto again; txq->vtntx_stats.vtxs_rescheduled++; taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); } return (0); } static int vtnet_txq_mq_start(struct ifnet *ifp, struct mbuf *m) { struct vtnet_softc *sc; struct vtnet_txq *txq; int i, npairs, error; sc = ifp->if_softc; npairs = sc->vtnet_act_vq_pairs; - if (m->m_flags & M_FLOWID) + /* check if flowid is set */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % npairs; else i = curcpu % npairs; txq = &sc->vtnet_txqs[i]; if (VTNET_TXQ_TRYLOCK(txq) != 0) { error = vtnet_txq_mq_start_locked(txq, m); VTNET_TXQ_UNLOCK(txq); } else { error = drbr_enqueue(ifp, txq->vtntx_br, m); taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask); } return (error); } static void vtnet_txq_tq_deferred(void *xtxq, int pending) { struct vtnet_softc *sc; struct vtnet_txq *txq; txq = xtxq; sc = txq->vtntx_sc; VTNET_TXQ_LOCK(txq); if (!drbr_empty(sc->vtnet_ifp, txq->vtntx_br)) vtnet_txq_mq_start_locked(txq, NULL); VTNET_TXQ_UNLOCK(txq); } #endif /* VTNET_LEGACY_TX */ static void vtnet_txq_start(struct vtnet_txq *txq) { struct vtnet_softc *sc; struct ifnet *ifp; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; #ifdef VTNET_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) vtnet_start_locked(txq, ifp); #else if (!drbr_empty(ifp, txq->vtntx_br)) vtnet_txq_mq_start_locked(txq, NULL); #endif } static void vtnet_txq_tq_intr(void *xtxq, int pending) { struct vtnet_softc *sc; struct vtnet_txq *txq; struct ifnet *ifp; txq = xtxq; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; VTNET_TXQ_LOCK(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VTNET_TXQ_UNLOCK(txq); return; } vtnet_txq_eof(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } static int vtnet_txq_eof(struct vtnet_txq *txq) { struct virtqueue *vq; struct vtnet_tx_header *txhdr; struct mbuf *m; int deq; vq = txq->vtntx_vq; deq = 0; VTNET_TXQ_LOCK_ASSERT(txq); #ifdef DEV_NETMAP if (netmap_tx_irq(txq->vtntx_sc->vtnet_ifp, txq->vtntx_id)) { virtqueue_disable_intr(vq); // XXX luigi return 0; // XXX or 1 ? } #endif /* DEV_NETMAP */ while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) { m = txhdr->vth_mbuf; deq++; txq->vtntx_stats.vtxs_opackets++; txq->vtntx_stats.vtxs_obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) txq->vtntx_stats.vtxs_omcasts++; m_freem(m); uma_zfree(vtnet_tx_header_zone, txhdr); } if (virtqueue_empty(vq)) txq->vtntx_watchdog = 0; return (deq); } static void vtnet_tx_vq_intr(void *xtxq) { struct vtnet_softc *sc; struct vtnet_txq *txq; struct ifnet *ifp; txq = xtxq; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; if (__predict_false(txq->vtntx_id >= sc->vtnet_act_vq_pairs)) { /* * Ignore this interrupt. Either this is a spurious interrupt * or multiqueue without per-VQ MSIX so every queue needs to * be polled (a brain dead configuration we could try harder * to avoid). */ vtnet_txq_disable_intr(txq); return; } VTNET_TXQ_LOCK(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VTNET_TXQ_UNLOCK(txq); return; } vtnet_txq_eof(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } static void vtnet_tx_start_all(struct vtnet_softc *sc) { struct vtnet_txq *txq; int i; VTNET_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } } #ifndef VTNET_LEGACY_TX static void vtnet_qflush(struct ifnet *ifp) { struct vtnet_softc *sc; struct vtnet_txq *txq; struct mbuf *m; int i; sc = ifp->if_softc; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); while ((m = buf_ring_dequeue_sc(txq->vtntx_br)) != NULL) m_freem(m); VTNET_TXQ_UNLOCK(txq); } if_qflush(ifp); } #endif static int vtnet_watchdog(struct vtnet_txq *txq) { struct ifnet *ifp; ifp = txq->vtntx_sc->vtnet_ifp; VTNET_TXQ_LOCK(txq); if (txq->vtntx_watchdog == 1) { /* * Only drain completed frames if the watchdog is about to * expire. If any frames were drained, there may be enough * free descriptors now available to transmit queued frames. * In that case, the timer will immediately be decremented * below, but the timeout is generous enough that should not * be a problem. */ if (vtnet_txq_eof(txq) != 0) vtnet_txq_start(txq); } if (txq->vtntx_watchdog == 0 || --txq->vtntx_watchdog) { VTNET_TXQ_UNLOCK(txq); return (0); } VTNET_TXQ_UNLOCK(txq); if_printf(ifp, "watchdog timeout on queue %d\n", txq->vtntx_id); return (1); } static void vtnet_accum_stats(struct vtnet_softc *sc, struct vtnet_rxq_stats *rxacc, struct vtnet_txq_stats *txacc) { bzero(rxacc, sizeof(struct vtnet_rxq_stats)); bzero(txacc, sizeof(struct vtnet_txq_stats)); for (int i = 0; i < sc->vtnet_max_vq_pairs; i++) { struct vtnet_rxq_stats *rxst; struct vtnet_txq_stats *txst; rxst = &sc->vtnet_rxqs[i].vtnrx_stats; rxacc->vrxs_ipackets += rxst->vrxs_ipackets; rxacc->vrxs_ibytes += rxst->vrxs_ibytes; rxacc->vrxs_iqdrops += rxst->vrxs_iqdrops; rxacc->vrxs_csum += rxst->vrxs_csum; rxacc->vrxs_csum_failed += rxst->vrxs_csum_failed; rxacc->vrxs_rescheduled += rxst->vrxs_rescheduled; txst = &sc->vtnet_txqs[i].vtntx_stats; txacc->vtxs_opackets += txst->vtxs_opackets; txacc->vtxs_obytes += txst->vtxs_obytes; txacc->vtxs_csum += txst->vtxs_csum; txacc->vtxs_tso += txst->vtxs_tso; txacc->vtxs_rescheduled += txst->vtxs_rescheduled; } } static uint64_t vtnet_get_counter(if_t ifp, ift_counter cnt) { struct vtnet_softc *sc; struct vtnet_rxq_stats rxaccum; struct vtnet_txq_stats txaccum; sc = if_getsoftc(ifp); vtnet_accum_stats(sc, &rxaccum, &txaccum); switch (cnt) { case IFCOUNTER_IPACKETS: return (rxaccum.vrxs_ipackets); case IFCOUNTER_IQDROPS: return (rxaccum.vrxs_iqdrops); case IFCOUNTER_IERRORS: return (rxaccum.vrxs_ierrors); case IFCOUNTER_OPACKETS: return (txaccum.vtxs_opackets); #ifndef VTNET_LEGACY_TX case IFCOUNTER_OBYTES: return (txaccum.vtxs_obytes); case IFCOUNTER_OMCASTS: return (txaccum.vtxs_omcasts); #endif default: return (if_get_counter_default(ifp, cnt)); } } static void vtnet_tick(void *xsc) { struct vtnet_softc *sc; struct ifnet *ifp; int i, timedout; sc = xsc; ifp = sc->vtnet_ifp; timedout = 0; VTNET_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) timedout |= vtnet_watchdog(&sc->vtnet_txqs[i]); if (timedout != 0) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vtnet_init_locked(sc); } else callout_schedule(&sc->vtnet_tick_ch, hz); } static void vtnet_start_taskqueues(struct vtnet_softc *sc) { device_t dev; struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i, error; dev = sc->vtnet_dev; /* * Errors here are very difficult to recover from - we cannot * easily fail because, if this is during boot, we will hang * when freeing any successfully started taskqueues because * the scheduler isn't up yet. * * Most drivers just ignore the return value - it only fails * with ENOMEM so an error is not likely. */ for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; error = taskqueue_start_threads(&rxq->vtnrx_tq, 1, PI_NET, "%s rxq %d", device_get_nameunit(dev), rxq->vtnrx_id); if (error) { device_printf(dev, "failed to start rx taskq %d\n", rxq->vtnrx_id); } txq = &sc->vtnet_txqs[i]; error = taskqueue_start_threads(&txq->vtntx_tq, 1, PI_NET, "%s txq %d", device_get_nameunit(dev), txq->vtntx_id); if (error) { device_printf(dev, "failed to start tx taskq %d\n", txq->vtntx_id); } } } static void vtnet_free_taskqueues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; if (rxq->vtnrx_tq != NULL) { taskqueue_free(rxq->vtnrx_tq); rxq->vtnrx_vq = NULL; } txq = &sc->vtnet_txqs[i]; if (txq->vtntx_tq != NULL) { taskqueue_free(txq->vtntx_tq); txq->vtntx_tq = NULL; } } } static void vtnet_drain_taskqueues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; if (rxq->vtnrx_tq != NULL) taskqueue_drain(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); txq = &sc->vtnet_txqs[i]; if (txq->vtntx_tq != NULL) { taskqueue_drain(txq->vtntx_tq, &txq->vtntx_intrtask); #ifndef VTNET_LEGACY_TX taskqueue_drain(txq->vtntx_tq, &txq->vtntx_defrtask); #endif } } } static void vtnet_drain_rxtx_queues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; vtnet_rxq_free_mbufs(rxq); txq = &sc->vtnet_txqs[i]; vtnet_txq_free_mbufs(txq); } } static void vtnet_stop_rendezvous(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; /* * Lock and unlock the per-queue mutex so we known the stop * state is visible. Doing only the active queues should be * sufficient, but it does not cost much extra to do all the * queues. Note we hold the core mutex here too. */ for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; VTNET_RXQ_LOCK(rxq); VTNET_RXQ_UNLOCK(rxq); txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); VTNET_TXQ_UNLOCK(txq); } } static void vtnet_stop(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; sc->vtnet_link_active = 0; callout_stop(&sc->vtnet_tick_ch); /* Only advisory. */ vtnet_disable_interrupts(sc); /* * Stop the host adapter. This resets it to the pre-initialized * state. It will not generate any interrupts until after it is * reinitialized. */ virtio_stop(dev); vtnet_stop_rendezvous(sc); /* Free any mbufs left in the virtqueues. */ vtnet_drain_rxtx_queues(sc); } static int vtnet_virtio_reinit(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; uint64_t features; int mask, error; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; features = sc->vtnet_features; mask = 0; #if defined(INET) mask |= IFCAP_RXCSUM; #endif #if defined (INET6) mask |= IFCAP_RXCSUM_IPV6; #endif /* * Re-negotiate with the host, removing any disabled receive * features. Transmit features are disabled only on our side * via if_capenable and if_hwassist. */ if (ifp->if_capabilities & mask) { /* * We require both IPv4 and IPv6 offloading to be enabled * in order to negotiated it: VirtIO does not distinguish * between the two. */ if ((ifp->if_capenable & mask) != mask) features &= ~VIRTIO_NET_F_GUEST_CSUM; } if (ifp->if_capabilities & IFCAP_LRO) { if ((ifp->if_capenable & IFCAP_LRO) == 0) features &= ~VTNET_LRO_FEATURES; } if (ifp->if_capabilities & IFCAP_VLAN_HWFILTER) { if ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0) features &= ~VIRTIO_NET_F_CTRL_VLAN; } error = virtio_reinit(dev, features); if (error) device_printf(dev, "virtio reinit error %d\n", error); return (error); } static void vtnet_init_rx_filters(struct vtnet_softc *sc) { struct ifnet *ifp; ifp = sc->vtnet_ifp; if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { /* Restore promiscuous and all-multicast modes. */ vtnet_rx_filter(sc); /* Restore filtered MAC addresses. */ vtnet_rx_filter_mac(sc); } if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) vtnet_rx_filter_vlan(sc); } static int vtnet_init_rx_queues(struct vtnet_softc *sc) { device_t dev; struct vtnet_rxq *rxq; int i, clsize, error; dev = sc->vtnet_dev; /* * Use the new cluster size if one has been set (via a MTU * change). Otherwise, use the standard 2K clusters. * * BMV: It might make sense to use page sized clusters as * the default (depending on the features negotiated). */ if (sc->vtnet_rx_new_clsize != 0) { clsize = sc->vtnet_rx_new_clsize; sc->vtnet_rx_new_clsize = 0; } else clsize = MCLBYTES; sc->vtnet_rx_clsize = clsize; sc->vtnet_rx_nmbufs = VTNET_NEEDED_RX_MBUFS(sc, clsize); KASSERT(sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS || sc->vtnet_rx_nmbufs < sc->vtnet_rx_nsegs, ("%s: too many rx mbufs %d for %d segments", __func__, sc->vtnet_rx_nmbufs, sc->vtnet_rx_nsegs)); #ifdef DEV_NETMAP if (vtnet_netmap_init_rx_buffers(sc)) return 0; #endif /* DEV_NETMAP */ for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; /* Hold the lock to satisfy asserts. */ VTNET_RXQ_LOCK(rxq); error = vtnet_rxq_populate(rxq); VTNET_RXQ_UNLOCK(rxq); if (error) { device_printf(dev, "cannot allocate mbufs for Rx queue %d\n", i); return (error); } } return (0); } static int vtnet_init_tx_queues(struct vtnet_softc *sc) { struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; txq->vtntx_watchdog = 0; } return (0); } static int vtnet_init_rxtx_queues(struct vtnet_softc *sc) { int error; error = vtnet_init_rx_queues(sc); if (error) return (error); error = vtnet_init_tx_queues(sc); if (error) return (error); return (0); } static void vtnet_set_active_vq_pairs(struct vtnet_softc *sc) { device_t dev; int npairs; dev = sc->vtnet_dev; if ((sc->vtnet_flags & VTNET_FLAG_MULTIQ) == 0) { MPASS(sc->vtnet_max_vq_pairs == 1); sc->vtnet_act_vq_pairs = 1; return; } /* BMV: Just use the maximum configured for now. */ npairs = sc->vtnet_max_vq_pairs; if (vtnet_ctrl_mq_cmd(sc, npairs) != 0) { device_printf(dev, "cannot set active queue pairs to %d\n", npairs); npairs = 1; } sc->vtnet_act_vq_pairs = npairs; } static int vtnet_reinit(struct vtnet_softc *sc) { struct ifnet *ifp; int error; ifp = sc->vtnet_ifp; /* Use the current MAC address. */ bcopy(IF_LLADDR(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN); vtnet_set_hwaddr(sc); vtnet_set_active_vq_pairs(sc); ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= VTNET_CSUM_OFFLOAD; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= VTNET_CSUM_OFFLOAD_IPV6; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_TSO; /* No CSUM_TSO_IPV6. */ if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) vtnet_init_rx_filters(sc); error = vtnet_init_rxtx_queues(sc); if (error) return (error); vtnet_enable_interrupts(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; return (0); } static void vtnet_init_locked(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; vtnet_stop(sc); /* Reinitialize with the host. */ if (vtnet_virtio_reinit(sc) != 0) goto fail; if (vtnet_reinit(sc) != 0) goto fail; virtio_reinit_complete(dev); vtnet_update_link_status(sc); callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc); return; fail: vtnet_stop(sc); } static void vtnet_init(void *xsc) { struct vtnet_softc *sc; sc = xsc; #ifdef DEV_NETMAP if (!NA(sc->vtnet_ifp)) { D("try to attach again"); vtnet_netmap_attach(sc); } #endif /* DEV_NETMAP */ VTNET_CORE_LOCK(sc); vtnet_init_locked(sc); VTNET_CORE_UNLOCK(sc); } static void vtnet_free_ctrl_vq(struct vtnet_softc *sc) { struct virtqueue *vq; vq = sc->vtnet_ctrl_vq; /* * The control virtqueue is only polled and therefore it should * already be empty. */ KASSERT(virtqueue_empty(vq), ("%s: ctrl vq %p not empty", __func__, vq)); } static void vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie, struct sglist *sg, int readable, int writable) { struct virtqueue *vq; vq = sc->vtnet_ctrl_vq; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ, ("%s: CTRL_VQ feature not negotiated", __func__)); if (!virtqueue_empty(vq)) return; if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0) return; /* * Poll for the response, but the command is likely already * done when we return from the notify. */ virtqueue_notify(vq); virtqueue_poll(vq, NULL); } static int vtnet_ctrl_mac_cmd(struct vtnet_softc *sc, uint8_t *hwaddr) { struct virtio_net_ctrl_hdr hdr __aligned(2); struct sglist_seg segs[3]; struct sglist sg; uint8_t ack; int error; hdr.class = VIRTIO_NET_CTRL_MAC; hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET; ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, hwaddr, ETHER_ADDR_LEN); error |= sglist_append(&sg, &ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding set MAC msg to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); return (ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_ctrl_mq_cmd(struct vtnet_softc *sc, uint16_t npairs) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; struct virtio_net_ctrl_mq mq; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; s.hdr.class = VIRTIO_NET_CTRL_MQ; s.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET; s.mq.virtqueue_pairs = npairs; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.mq, sizeof(struct virtio_net_ctrl_mq)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding MQ message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; uint8_t onoff; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, ("%s: CTRL_RX feature not negotiated", __func__)); s.hdr.class = VIRTIO_NET_CTRL_RX; s.hdr.cmd = cmd; s.onoff = !!on; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.onoff, sizeof(uint8_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding Rx message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_set_promisc(struct vtnet_softc *sc, int on) { return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on)); } static int vtnet_set_allmulti(struct vtnet_softc *sc, int on) { return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on)); } /* * The device defaults to promiscuous mode for backwards compatibility. * Turn it off at attach time if possible. */ static void vtnet_attach_disable_promisc(struct vtnet_softc *sc) { struct ifnet *ifp; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK(sc); if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) { ifp->if_flags |= IFF_PROMISC; } else if (vtnet_set_promisc(sc, 0) != 0) { ifp->if_flags |= IFF_PROMISC; device_printf(sc->vtnet_dev, "cannot disable default promiscuous mode\n"); } VTNET_CORE_UNLOCK(sc); } static void vtnet_rx_filter(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (vtnet_set_promisc(sc, ifp->if_flags & IFF_PROMISC) != 0) device_printf(dev, "cannot %s promiscuous mode\n", ifp->if_flags & IFF_PROMISC ? "enable" : "disable"); if (vtnet_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI) != 0) device_printf(dev, "cannot %s all-multicast mode\n", ifp->if_flags & IFF_ALLMULTI ? "enable" : "disable"); } static void vtnet_rx_filter_mac(struct vtnet_softc *sc) { struct virtio_net_ctrl_hdr hdr __aligned(2); struct vtnet_mac_filter *filter; struct sglist_seg segs[4]; struct sglist sg; struct ifnet *ifp; struct ifaddr *ifa; struct ifmultiaddr *ifma; int ucnt, mcnt, promisc, allmulti, error; uint8_t ack; ifp = sc->vtnet_ifp; filter = sc->vtnet_mac_filter; ucnt = 0; mcnt = 0; promisc = 0; allmulti = 0; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, ("%s: CTRL_RX feature not negotiated", __func__)); /* Unicast MAC addresses: */ if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; else if (memcmp(LLADDR((struct sockaddr_dl *)ifa->ifa_addr), sc->vtnet_hwaddr, ETHER_ADDR_LEN) == 0) continue; else if (ucnt == VTNET_MAX_MAC_ENTRIES) { promisc = 1; break; } bcopy(LLADDR((struct sockaddr_dl *)ifa->ifa_addr), &filter->vmf_unicast.macs[ucnt], ETHER_ADDR_LEN); ucnt++; } if_addr_runlock(ifp); if (promisc != 0) { filter->vmf_unicast.nentries = 0; if_printf(ifp, "more than %d MAC addresses assigned, " "falling back to promiscuous mode\n", VTNET_MAX_MAC_ENTRIES); } else filter->vmf_unicast.nentries = ucnt; /* Multicast MAC addresses: */ if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; else if (mcnt == VTNET_MAX_MAC_ENTRIES) { allmulti = 1; break; } bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN); mcnt++; } if_maddr_runlock(ifp); if (allmulti != 0) { filter->vmf_multicast.nentries = 0; if_printf(ifp, "more than %d multicast MAC addresses " "assigned, falling back to all-multicast mode\n", VTNET_MAX_MAC_ENTRIES); } else filter->vmf_multicast.nentries = mcnt; if (promisc != 0 && allmulti != 0) goto out; hdr.class = VIRTIO_NET_CTRL_MAC; hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET; ack = VIRTIO_NET_ERR; sglist_init(&sg, 4, segs); error = 0; error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &filter->vmf_unicast, sizeof(uint32_t) + filter->vmf_unicast.nentries * ETHER_ADDR_LEN); error |= sglist_append(&sg, &filter->vmf_multicast, sizeof(uint32_t) + filter->vmf_multicast.nentries * ETHER_ADDR_LEN); error |= sglist_append(&sg, &ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 4, ("%s: error %d adding MAC filter msg to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); if (ack != VIRTIO_NET_OK) if_printf(ifp, "error setting host MAC filter table\n"); out: if (promisc != 0 && vtnet_set_promisc(sc, 1) != 0) if_printf(ifp, "cannot enable promiscuous mode\n"); if (allmulti != 0 && vtnet_set_allmulti(sc, 1) != 0) if_printf(ifp, "cannot enable all-multicast mode\n"); } static int vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; uint16_t tag; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; s.hdr.class = VIRTIO_NET_CTRL_VLAN; s.hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL; s.tag = tag; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.tag, sizeof(uint16_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding VLAN message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static void vtnet_rx_filter_vlan(struct vtnet_softc *sc) { uint32_t w; uint16_t tag; int i, bit; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER, ("%s: VLAN_FILTER feature not negotiated", __func__)); /* Enable the filter for each configured VLAN. */ for (i = 0; i < VTNET_VLAN_FILTER_NWORDS; i++) { w = sc->vtnet_vlan_filter[i]; while ((bit = ffs(w) - 1) != -1) { w &= ~(1 << bit); tag = sizeof(w) * CHAR_BIT * i + bit; if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) { device_printf(sc->vtnet_dev, "cannot enable VLAN %d filter\n", tag); } } } } static void vtnet_update_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { struct ifnet *ifp; int idx, bit; ifp = sc->vtnet_ifp; idx = (tag >> 5) & 0x7F; bit = tag & 0x1F; if (tag == 0 || tag > 4095) return; VTNET_CORE_LOCK(sc); if (add) sc->vtnet_vlan_filter[idx] |= (1 << bit); else sc->vtnet_vlan_filter[idx] &= ~(1 << bit); if (ifp->if_capenable & IFCAP_VLAN_HWFILTER && vtnet_exec_vlan_filter(sc, add, tag) != 0) { device_printf(sc->vtnet_dev, "cannot %s VLAN %d %s the host filter table\n", add ? "add" : "remove", tag, add ? "to" : "from"); } VTNET_CORE_UNLOCK(sc); } static void vtnet_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag) { if (ifp->if_softc != arg) return; vtnet_update_vlan_filter(arg, 1, tag); } static void vtnet_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag) { if (ifp->if_softc != arg) return; vtnet_update_vlan_filter(arg, 0, tag); } static int vtnet_is_link_up(struct vtnet_softc *sc) { device_t dev; struct ifnet *ifp; uint16_t status; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; if ((ifp->if_capabilities & IFCAP_LINKSTATE) == 0) status = VIRTIO_NET_S_LINK_UP; else status = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, status)); return ((status & VIRTIO_NET_S_LINK_UP) != 0); } static void vtnet_update_link_status(struct vtnet_softc *sc) { struct ifnet *ifp; int link; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); link = vtnet_is_link_up(sc); /* Notify if the link status has changed. */ if (link != 0 && sc->vtnet_link_active == 0) { sc->vtnet_link_active = 1; if_link_state_change(ifp, LINK_STATE_UP); } else if (link == 0 && sc->vtnet_link_active != 0) { sc->vtnet_link_active = 0; if_link_state_change(ifp, LINK_STATE_DOWN); } } static int vtnet_ifmedia_upd(struct ifnet *ifp) { struct vtnet_softc *sc; struct ifmedia *ifm; sc = ifp->if_softc; ifm = &sc->vtnet_media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); return (0); } static void vtnet_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct vtnet_softc *sc; sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; VTNET_CORE_LOCK(sc); if (vtnet_is_link_up(sc) != 0) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= VTNET_MEDIATYPE; } else ifmr->ifm_active |= IFM_NONE; VTNET_CORE_UNLOCK(sc); } static void vtnet_set_hwaddr(struct vtnet_softc *sc) { device_t dev; int i; dev = sc->vtnet_dev; if (sc->vtnet_flags & VTNET_FLAG_CTRL_MAC) { if (vtnet_ctrl_mac_cmd(sc, sc->vtnet_hwaddr) != 0) device_printf(dev, "unable to set MAC address\n"); } else if (sc->vtnet_flags & VTNET_FLAG_MAC) { for (i = 0; i < ETHER_ADDR_LEN; i++) { virtio_write_dev_config_1(dev, offsetof(struct virtio_net_config, mac) + i, sc->vtnet_hwaddr[i]); } } } static void vtnet_get_hwaddr(struct vtnet_softc *sc) { device_t dev; int i; dev = sc->vtnet_dev; if ((sc->vtnet_flags & VTNET_FLAG_MAC) == 0) { /* * Generate a random locally administered unicast address. * * It would be nice to generate the same MAC address across * reboots, but it seems all the hosts currently available * support the MAC feature, so this isn't too important. */ sc->vtnet_hwaddr[0] = 0xB2; arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0); vtnet_set_hwaddr(sc); return; } for (i = 0; i < ETHER_ADDR_LEN; i++) { sc->vtnet_hwaddr[i] = virtio_read_dev_config_1(dev, offsetof(struct virtio_net_config, mac) + i); } } static void vtnet_vlan_tag_remove(struct mbuf *m) { struct ether_vlan_header *evh; evh = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag); m->m_flags |= M_VLANTAG; /* Strip the 802.1Q header. */ bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } static void vtnet_set_rx_process_limit(struct vtnet_softc *sc) { int limit; limit = vtnet_tunable_int(sc, "rx_process_limit", vtnet_rx_process_limit); if (limit < 0) limit = INT_MAX; sc->vtnet_rx_process_limit = limit; } static void vtnet_set_tx_intr_threshold(struct vtnet_softc *sc) { device_t dev; int size, thresh; dev = sc->vtnet_dev; size = virtqueue_size(sc->vtnet_txqs[0].vtntx_vq); /* * The Tx interrupt is disabled until the queue free count falls * below our threshold. Completed frames are drained from the Tx * virtqueue before transmitting new frames and in the watchdog * callout, so the frequency of Tx interrupts is greatly reduced, * at the cost of not freeing mbufs as quickly as they otherwise * would be. * * N.B. We assume all the Tx queues are the same size. */ thresh = size / 4; /* * Without indirect descriptors, leave enough room for the most * segments we handle. */ if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC) == 0 && thresh < sc->vtnet_tx_nsegs) thresh = sc->vtnet_tx_nsegs; sc->vtnet_tx_intr_thresh = thresh; } static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_rxq *rxq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct vtnet_rxq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Receive Queue"); list = SYSCTL_CHILDREN(node); stats = &rxq->vtnrx_stats; SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD, &stats->vrxs_ipackets, "Receive packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD, &stats->vrxs_ibytes, "Receive bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD, &stats->vrxs_iqdrops, "Receive drops"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD, &stats->vrxs_ierrors, "Receive errors"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vrxs_csum, "Receive checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD, &stats->vrxs_csum_failed, "Receive checksum offload failed"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, &stats->vrxs_rescheduled, "Receive interrupt handler rescheduled"); } static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_txq *txq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct vtnet_txq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vtntx_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Transmit Queue"); list = SYSCTL_CHILDREN(node); stats = &txq->vtntx_stats; SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD, &stats->vtxs_opackets, "Transmit packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD, &stats->vtxs_obytes, "Transmit bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD, &stats->vtxs_omcasts, "Transmit multicasts"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vtxs_csum, "Transmit checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD, &stats->vtxs_tso, "Transmit segmentation offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, &stats->vtxs_rescheduled, "Transmit interrupt handler rescheduled"); } static void vtnet_setup_queue_sysctl(struct vtnet_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; int i; dev = sc->vtnet_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { vtnet_setup_rxq_sysctl(ctx, child, &sc->vtnet_rxqs[i]); vtnet_setup_txq_sysctl(ctx, child, &sc->vtnet_txqs[i]); } } static void vtnet_setup_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_softc *sc) { struct vtnet_statistics *stats; struct vtnet_rxq_stats rxaccum; struct vtnet_txq_stats txaccum; vtnet_accum_stats(sc, &rxaccum, &txaccum); stats = &sc->vtnet_stats; stats->rx_csum_offloaded = rxaccum.vrxs_csum; stats->rx_csum_failed = rxaccum.vrxs_csum_failed; stats->rx_task_rescheduled = rxaccum.vrxs_rescheduled; stats->tx_csum_offloaded = txaccum.vtxs_csum; stats->tx_tso_offloaded = txaccum.vtxs_tso; stats->tx_task_rescheduled = txaccum.vtxs_rescheduled; SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "mbuf_alloc_failed", CTLFLAG_RD, &stats->mbuf_alloc_failed, "Mbuf cluster allocation failures"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_frame_too_large", CTLFLAG_RD, &stats->rx_frame_too_large, "Received frame larger than the mbuf chain"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_enq_replacement_failed", CTLFLAG_RD, &stats->rx_enq_replacement_failed, "Enqueuing the replacement receive mbuf failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_mergeable_failed", CTLFLAG_RD, &stats->rx_mergeable_failed, "Mergeable buffers receive failures"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ethtype", CTLFLAG_RD, &stats->rx_csum_bad_ethtype, "Received checksum offloaded buffer with unsupported " "Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ipproto", CTLFLAG_RD, &stats->rx_csum_bad_ipproto, "Received checksum offloaded buffer with incorrect IP protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_offset", CTLFLAG_RD, &stats->rx_csum_bad_offset, "Received checksum offloaded buffer with incorrect offset"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_proto", CTLFLAG_RD, &stats->rx_csum_bad_proto, "Received checksum offloaded buffer with incorrect protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_failed", CTLFLAG_RD, &stats->rx_csum_failed, "Received buffer checksum offload failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_offloaded", CTLFLAG_RD, &stats->rx_csum_offloaded, "Received buffer checksum offload succeeded"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_task_rescheduled", CTLFLAG_RD, &stats->rx_task_rescheduled, "Times the receive interrupt task rescheduled itself"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_bad_ethtype", CTLFLAG_RD, &stats->tx_csum_bad_ethtype, "Aborted transmit of checksum offloaded buffer with unknown " "Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_bad_ethtype", CTLFLAG_RD, &stats->tx_tso_bad_ethtype, "Aborted transmit of TSO buffer with unknown Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_not_tcp", CTLFLAG_RD, &stats->tx_tso_not_tcp, "Aborted transmit of TSO buffer with non TCP protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defragged", CTLFLAG_RD, &stats->tx_defragged, "Transmit mbufs defragged"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defrag_failed", CTLFLAG_RD, &stats->tx_defrag_failed, "Aborted transmit of buffer because defrag failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_offloaded", CTLFLAG_RD, &stats->tx_csum_offloaded, "Offloaded checksum of transmitted buffer"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_offloaded", CTLFLAG_RD, &stats->tx_tso_offloaded, "Segmentation offload of transmitted buffer"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_task_rescheduled", CTLFLAG_RD, &stats->tx_task_rescheduled, "Times the transmit interrupt task rescheduled itself"); } static void vtnet_setup_sysctl(struct vtnet_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vtnet_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_vq_pairs", CTLFLAG_RD, &sc->vtnet_max_vq_pairs, 0, "Maximum number of supported virtqueue pairs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "act_vq_pairs", CTLFLAG_RD, &sc->vtnet_act_vq_pairs, 0, "Number of active virtqueue pairs"); vtnet_setup_stat_sysctl(ctx, child, sc); } static int vtnet_rxq_enable_intr(struct vtnet_rxq *rxq) { return (virtqueue_enable_intr(rxq->vtnrx_vq)); } static void vtnet_rxq_disable_intr(struct vtnet_rxq *rxq) { virtqueue_disable_intr(rxq->vtnrx_vq); } static int vtnet_txq_enable_intr(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; if (vtnet_txq_below_threshold(txq) != 0) return (virtqueue_postpone_intr(vq, VQ_POSTPONE_LONG)); /* * The free count is above our threshold. Keep the Tx interrupt * disabled until the queue is fuller. */ return (0); } static void vtnet_txq_disable_intr(struct vtnet_txq *txq) { virtqueue_disable_intr(txq->vtntx_vq); } static void vtnet_enable_rx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_rxq_enable_intr(&sc->vtnet_rxqs[i]); } static void vtnet_enable_tx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_txq_enable_intr(&sc->vtnet_txqs[i]); } static void vtnet_enable_interrupts(struct vtnet_softc *sc) { vtnet_enable_rx_interrupts(sc); vtnet_enable_tx_interrupts(sc); } static void vtnet_disable_rx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_rxq_disable_intr(&sc->vtnet_rxqs[i]); } static void vtnet_disable_tx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_txq_disable_intr(&sc->vtnet_txqs[i]); } static void vtnet_disable_interrupts(struct vtnet_softc *sc) { vtnet_disable_rx_interrupts(sc); vtnet_disable_tx_interrupts(sc); } static int vtnet_tunable_int(struct vtnet_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "hw.vtnet.%d.%s", device_get_unit(sc->vtnet_dev), knob); TUNABLE_INT_FETCH(path, &def); return (def); } Index: head/sys/dev/vmware/vmxnet3/if_vmx.c =================================================================== --- head/sys/dev/vmware/vmxnet3/if_vmx.c (revision 275357) +++ head/sys/dev/vmware/vmxnet3/if_vmx.c (revision 275358) @@ -1,3914 +1,3915 @@ /*- * Copyright (c) 2013 Tsubai Masanari * Copyright (c) 2013 Bryan Venteicher * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * $OpenBSD: src/sys/dev/pci/if_vmx.c,v 1.11 2013/06/22 00:28:10 uebayasi Exp $ */ /* Driver for VMware vmxnet3 virtual ethernet devices. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "if_vmxreg.h" #include "if_vmxvar.h" #include "opt_inet.h" #include "opt_inet6.h" #ifdef VMXNET3_FAILPOINTS #include static SYSCTL_NODE(DEBUG_FP, OID_AUTO, vmxnet3, CTLFLAG_RW, 0, "vmxnet3 fail points"); #define VMXNET3_FP _debug_fail_point_vmxnet3 #endif static int vmxnet3_probe(device_t); static int vmxnet3_attach(device_t); static int vmxnet3_detach(device_t); static int vmxnet3_shutdown(device_t); static int vmxnet3_alloc_resources(struct vmxnet3_softc *); static void vmxnet3_free_resources(struct vmxnet3_softc *); static int vmxnet3_check_version(struct vmxnet3_softc *); static void vmxnet3_initial_config(struct vmxnet3_softc *); static void vmxnet3_check_multiqueue(struct vmxnet3_softc *); static int vmxnet3_alloc_msix_interrupts(struct vmxnet3_softc *); static int vmxnet3_alloc_msi_interrupts(struct vmxnet3_softc *); static int vmxnet3_alloc_legacy_interrupts(struct vmxnet3_softc *); static int vmxnet3_alloc_interrupt(struct vmxnet3_softc *, int, int, struct vmxnet3_interrupt *); static int vmxnet3_alloc_intr_resources(struct vmxnet3_softc *); static int vmxnet3_setup_msix_interrupts(struct vmxnet3_softc *); static int vmxnet3_setup_legacy_interrupt(struct vmxnet3_softc *); static int vmxnet3_setup_interrupts(struct vmxnet3_softc *); static int vmxnet3_alloc_interrupts(struct vmxnet3_softc *); static void vmxnet3_free_interrupt(struct vmxnet3_softc *, struct vmxnet3_interrupt *); static void vmxnet3_free_interrupts(struct vmxnet3_softc *); #ifndef VMXNET3_LEGACY_TX static int vmxnet3_alloc_taskqueue(struct vmxnet3_softc *); static void vmxnet3_start_taskqueue(struct vmxnet3_softc *); static void vmxnet3_drain_taskqueue(struct vmxnet3_softc *); static void vmxnet3_free_taskqueue(struct vmxnet3_softc *); #endif static int vmxnet3_init_rxq(struct vmxnet3_softc *, int); static int vmxnet3_init_txq(struct vmxnet3_softc *, int); static int vmxnet3_alloc_rxtx_queues(struct vmxnet3_softc *); static void vmxnet3_destroy_rxq(struct vmxnet3_rxqueue *); static void vmxnet3_destroy_txq(struct vmxnet3_txqueue *); static void vmxnet3_free_rxtx_queues(struct vmxnet3_softc *); static int vmxnet3_alloc_shared_data(struct vmxnet3_softc *); static void vmxnet3_free_shared_data(struct vmxnet3_softc *); static int vmxnet3_alloc_txq_data(struct vmxnet3_softc *); static void vmxnet3_free_txq_data(struct vmxnet3_softc *); static int vmxnet3_alloc_rxq_data(struct vmxnet3_softc *); static void vmxnet3_free_rxq_data(struct vmxnet3_softc *); static int vmxnet3_alloc_queue_data(struct vmxnet3_softc *); static void vmxnet3_free_queue_data(struct vmxnet3_softc *); static int vmxnet3_alloc_mcast_table(struct vmxnet3_softc *); static void vmxnet3_init_shared_data(struct vmxnet3_softc *); static void vmxnet3_reinit_interface(struct vmxnet3_softc *); static void vmxnet3_reinit_rss_shared_data(struct vmxnet3_softc *); static void vmxnet3_reinit_shared_data(struct vmxnet3_softc *); static int vmxnet3_alloc_data(struct vmxnet3_softc *); static void vmxnet3_free_data(struct vmxnet3_softc *); static int vmxnet3_setup_interface(struct vmxnet3_softc *); static void vmxnet3_evintr(struct vmxnet3_softc *); static void vmxnet3_txq_eof(struct vmxnet3_txqueue *); static void vmxnet3_rx_csum(struct vmxnet3_rxcompdesc *, struct mbuf *); static int vmxnet3_newbuf(struct vmxnet3_softc *, struct vmxnet3_rxring *); static void vmxnet3_rxq_eof_discard(struct vmxnet3_rxqueue *, struct vmxnet3_rxring *, int); static void vmxnet3_rxq_eof(struct vmxnet3_rxqueue *); static void vmxnet3_legacy_intr(void *); static void vmxnet3_txq_intr(void *); static void vmxnet3_rxq_intr(void *); static void vmxnet3_event_intr(void *); static void vmxnet3_txstop(struct vmxnet3_softc *, struct vmxnet3_txqueue *); static void vmxnet3_rxstop(struct vmxnet3_softc *, struct vmxnet3_rxqueue *); static void vmxnet3_stop(struct vmxnet3_softc *); static void vmxnet3_txinit(struct vmxnet3_softc *, struct vmxnet3_txqueue *); static int vmxnet3_rxinit(struct vmxnet3_softc *, struct vmxnet3_rxqueue *); static int vmxnet3_reinit_queues(struct vmxnet3_softc *); static int vmxnet3_enable_device(struct vmxnet3_softc *); static void vmxnet3_reinit_rxfilters(struct vmxnet3_softc *); static int vmxnet3_reinit(struct vmxnet3_softc *); static void vmxnet3_init_locked(struct vmxnet3_softc *); static void vmxnet3_init(void *); static int vmxnet3_txq_offload_ctx(struct vmxnet3_txqueue *,struct mbuf *, int *, int *, int *); static int vmxnet3_txq_load_mbuf(struct vmxnet3_txqueue *, struct mbuf **, bus_dmamap_t, bus_dma_segment_t [], int *); static void vmxnet3_txq_unload_mbuf(struct vmxnet3_txqueue *, bus_dmamap_t); static int vmxnet3_txq_encap(struct vmxnet3_txqueue *, struct mbuf **); #ifdef VMXNET3_LEGACY_TX static void vmxnet3_start_locked(struct ifnet *); static void vmxnet3_start(struct ifnet *); #else static int vmxnet3_txq_mq_start_locked(struct vmxnet3_txqueue *, struct mbuf *); static int vmxnet3_txq_mq_start(struct ifnet *, struct mbuf *); static void vmxnet3_txq_tq_deferred(void *, int); #endif static void vmxnet3_txq_start(struct vmxnet3_txqueue *); static void vmxnet3_tx_start_all(struct vmxnet3_softc *); static void vmxnet3_update_vlan_filter(struct vmxnet3_softc *, int, uint16_t); static void vmxnet3_register_vlan(void *, struct ifnet *, uint16_t); static void vmxnet3_unregister_vlan(void *, struct ifnet *, uint16_t); static void vmxnet3_set_rxfilter(struct vmxnet3_softc *); static int vmxnet3_change_mtu(struct vmxnet3_softc *, int); static int vmxnet3_ioctl(struct ifnet *, u_long, caddr_t); static uint64_t vmxnet3_get_counter(struct ifnet *, ift_counter); #ifndef VMXNET3_LEGACY_TX static void vmxnet3_qflush(struct ifnet *); #endif static int vmxnet3_watchdog(struct vmxnet3_txqueue *); static void vmxnet3_refresh_host_stats(struct vmxnet3_softc *); static void vmxnet3_tick(void *); static void vmxnet3_link_status(struct vmxnet3_softc *); static void vmxnet3_media_status(struct ifnet *, struct ifmediareq *); static int vmxnet3_media_change(struct ifnet *); static void vmxnet3_set_lladdr(struct vmxnet3_softc *); static void vmxnet3_get_lladdr(struct vmxnet3_softc *); static void vmxnet3_setup_txq_sysctl(struct vmxnet3_txqueue *, struct sysctl_ctx_list *, struct sysctl_oid_list *); static void vmxnet3_setup_rxq_sysctl(struct vmxnet3_rxqueue *, struct sysctl_ctx_list *, struct sysctl_oid_list *); static void vmxnet3_setup_queue_sysctl(struct vmxnet3_softc *, struct sysctl_ctx_list *, struct sysctl_oid_list *); static void vmxnet3_setup_sysctl(struct vmxnet3_softc *); static void vmxnet3_write_bar0(struct vmxnet3_softc *, bus_size_t, uint32_t); static uint32_t vmxnet3_read_bar1(struct vmxnet3_softc *, bus_size_t); static void vmxnet3_write_bar1(struct vmxnet3_softc *, bus_size_t, uint32_t); static void vmxnet3_write_cmd(struct vmxnet3_softc *, uint32_t); static uint32_t vmxnet3_read_cmd(struct vmxnet3_softc *, uint32_t); static void vmxnet3_enable_intr(struct vmxnet3_softc *, int); static void vmxnet3_disable_intr(struct vmxnet3_softc *, int); static void vmxnet3_enable_all_intrs(struct vmxnet3_softc *); static void vmxnet3_disable_all_intrs(struct vmxnet3_softc *); static int vmxnet3_dma_malloc(struct vmxnet3_softc *, bus_size_t, bus_size_t, struct vmxnet3_dma_alloc *); static void vmxnet3_dma_free(struct vmxnet3_softc *, struct vmxnet3_dma_alloc *); static int vmxnet3_tunable_int(struct vmxnet3_softc *, const char *, int); typedef enum { VMXNET3_BARRIER_RD, VMXNET3_BARRIER_WR, VMXNET3_BARRIER_RDWR, } vmxnet3_barrier_t; static void vmxnet3_barrier(struct vmxnet3_softc *, vmxnet3_barrier_t); /* Tunables. */ static int vmxnet3_mq_disable = 0; TUNABLE_INT("hw.vmx.mq_disable", &vmxnet3_mq_disable); static int vmxnet3_default_txnqueue = VMXNET3_DEF_TX_QUEUES; TUNABLE_INT("hw.vmx.txnqueue", &vmxnet3_default_txnqueue); static int vmxnet3_default_rxnqueue = VMXNET3_DEF_RX_QUEUES; TUNABLE_INT("hw.vmx.rxnqueue", &vmxnet3_default_rxnqueue); static int vmxnet3_default_txndesc = VMXNET3_DEF_TX_NDESC; TUNABLE_INT("hw.vmx.txndesc", &vmxnet3_default_txndesc); static int vmxnet3_default_rxndesc = VMXNET3_DEF_RX_NDESC; TUNABLE_INT("hw.vmx.rxndesc", &vmxnet3_default_rxndesc); static device_method_t vmxnet3_methods[] = { /* Device interface. */ DEVMETHOD(device_probe, vmxnet3_probe), DEVMETHOD(device_attach, vmxnet3_attach), DEVMETHOD(device_detach, vmxnet3_detach), DEVMETHOD(device_shutdown, vmxnet3_shutdown), DEVMETHOD_END }; static driver_t vmxnet3_driver = { "vmx", vmxnet3_methods, sizeof(struct vmxnet3_softc) }; static devclass_t vmxnet3_devclass; DRIVER_MODULE(vmx, pci, vmxnet3_driver, vmxnet3_devclass, 0, 0); MODULE_DEPEND(vmx, pci, 1, 1, 1); MODULE_DEPEND(vmx, ether, 1, 1, 1); #define VMXNET3_VMWARE_VENDOR_ID 0x15AD #define VMXNET3_VMWARE_DEVICE_ID 0x07B0 static int vmxnet3_probe(device_t dev) { if (pci_get_vendor(dev) == VMXNET3_VMWARE_VENDOR_ID && pci_get_device(dev) == VMXNET3_VMWARE_DEVICE_ID) { device_set_desc(dev, "VMware VMXNET3 Ethernet Adapter"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } static int vmxnet3_attach(device_t dev) { struct vmxnet3_softc *sc; int error; sc = device_get_softc(dev); sc->vmx_dev = dev; pci_enable_busmaster(dev); VMXNET3_CORE_LOCK_INIT(sc, device_get_nameunit(dev)); callout_init_mtx(&sc->vmx_tick, &sc->vmx_mtx, 0); vmxnet3_initial_config(sc); error = vmxnet3_alloc_resources(sc); if (error) goto fail; error = vmxnet3_check_version(sc); if (error) goto fail; error = vmxnet3_alloc_rxtx_queues(sc); if (error) goto fail; #ifndef VMXNET3_LEGACY_TX error = vmxnet3_alloc_taskqueue(sc); if (error) goto fail; #endif error = vmxnet3_alloc_interrupts(sc); if (error) goto fail; vmxnet3_check_multiqueue(sc); error = vmxnet3_alloc_data(sc); if (error) goto fail; error = vmxnet3_setup_interface(sc); if (error) goto fail; error = vmxnet3_setup_interrupts(sc); if (error) { ether_ifdetach(sc->vmx_ifp); device_printf(dev, "could not set up interrupt\n"); goto fail; } vmxnet3_setup_sysctl(sc); #ifndef VMXNET3_LEGACY_TX vmxnet3_start_taskqueue(sc); #endif fail: if (error) vmxnet3_detach(dev); return (error); } static int vmxnet3_detach(device_t dev) { struct vmxnet3_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); ifp = sc->vmx_ifp; if (device_is_attached(dev)) { VMXNET3_CORE_LOCK(sc); vmxnet3_stop(sc); VMXNET3_CORE_UNLOCK(sc); callout_drain(&sc->vmx_tick); #ifndef VMXNET3_LEGACY_TX vmxnet3_drain_taskqueue(sc); #endif ether_ifdetach(ifp); } if (sc->vmx_vlan_attach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vmx_vlan_attach); sc->vmx_vlan_attach = NULL; } if (sc->vmx_vlan_detach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vmx_vlan_detach); sc->vmx_vlan_detach = NULL; } #ifndef VMXNET3_LEGACY_TX vmxnet3_free_taskqueue(sc); #endif vmxnet3_free_interrupts(sc); if (ifp != NULL) { if_free(ifp); sc->vmx_ifp = NULL; } ifmedia_removeall(&sc->vmx_media); vmxnet3_free_data(sc); vmxnet3_free_resources(sc); vmxnet3_free_rxtx_queues(sc); VMXNET3_CORE_LOCK_DESTROY(sc); return (0); } static int vmxnet3_shutdown(device_t dev) { return (0); } static int vmxnet3_alloc_resources(struct vmxnet3_softc *sc) { device_t dev; int rid; dev = sc->vmx_dev; rid = PCIR_BAR(0); sc->vmx_res0 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->vmx_res0 == NULL) { device_printf(dev, "could not map BAR0 memory\n"); return (ENXIO); } sc->vmx_iot0 = rman_get_bustag(sc->vmx_res0); sc->vmx_ioh0 = rman_get_bushandle(sc->vmx_res0); rid = PCIR_BAR(1); sc->vmx_res1 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->vmx_res1 == NULL) { device_printf(dev, "could not map BAR1 memory\n"); return (ENXIO); } sc->vmx_iot1 = rman_get_bustag(sc->vmx_res1); sc->vmx_ioh1 = rman_get_bushandle(sc->vmx_res1); if (pci_find_cap(dev, PCIY_MSIX, NULL) == 0) { rid = PCIR_BAR(2); sc->vmx_msix_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); } if (sc->vmx_msix_res == NULL) sc->vmx_flags |= VMXNET3_FLAG_NO_MSIX; return (0); } static void vmxnet3_free_resources(struct vmxnet3_softc *sc) { device_t dev; int rid; dev = sc->vmx_dev; if (sc->vmx_res0 != NULL) { rid = PCIR_BAR(0); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->vmx_res0); sc->vmx_res0 = NULL; } if (sc->vmx_res1 != NULL) { rid = PCIR_BAR(1); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->vmx_res1); sc->vmx_res1 = NULL; } if (sc->vmx_msix_res != NULL) { rid = PCIR_BAR(2); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->vmx_msix_res); sc->vmx_msix_res = NULL; } } static int vmxnet3_check_version(struct vmxnet3_softc *sc) { device_t dev; uint32_t version; dev = sc->vmx_dev; version = vmxnet3_read_bar1(sc, VMXNET3_BAR1_VRRS); if ((version & 0x01) == 0) { device_printf(dev, "unsupported hardware version %#x\n", version); return (ENOTSUP); } vmxnet3_write_bar1(sc, VMXNET3_BAR1_VRRS, 1); version = vmxnet3_read_bar1(sc, VMXNET3_BAR1_UVRS); if ((version & 0x01) == 0) { device_printf(dev, "unsupported UPT version %#x\n", version); return (ENOTSUP); } vmxnet3_write_bar1(sc, VMXNET3_BAR1_UVRS, 1); return (0); } static void vmxnet3_initial_config(struct vmxnet3_softc *sc) { int nqueue, ndesc; nqueue = vmxnet3_tunable_int(sc, "txnqueue", vmxnet3_default_txnqueue); if (nqueue > VMXNET3_MAX_TX_QUEUES || nqueue < 1) nqueue = VMXNET3_DEF_TX_QUEUES; if (nqueue > mp_ncpus) nqueue = mp_ncpus; sc->vmx_max_ntxqueues = nqueue; nqueue = vmxnet3_tunable_int(sc, "rxnqueue", vmxnet3_default_rxnqueue); if (nqueue > VMXNET3_MAX_RX_QUEUES || nqueue < 1) nqueue = VMXNET3_DEF_RX_QUEUES; if (nqueue > mp_ncpus) nqueue = mp_ncpus; sc->vmx_max_nrxqueues = nqueue; if (vmxnet3_tunable_int(sc, "mq_disable", vmxnet3_mq_disable)) { sc->vmx_max_nrxqueues = 1; sc->vmx_max_ntxqueues = 1; } ndesc = vmxnet3_tunable_int(sc, "txd", vmxnet3_default_txndesc); if (ndesc > VMXNET3_MAX_TX_NDESC || ndesc < VMXNET3_MIN_TX_NDESC) ndesc = VMXNET3_DEF_TX_NDESC; if (ndesc & VMXNET3_MASK_TX_NDESC) ndesc &= ~VMXNET3_MASK_TX_NDESC; sc->vmx_ntxdescs = ndesc; ndesc = vmxnet3_tunable_int(sc, "rxd", vmxnet3_default_rxndesc); if (ndesc > VMXNET3_MAX_RX_NDESC || ndesc < VMXNET3_MIN_RX_NDESC) ndesc = VMXNET3_DEF_RX_NDESC; if (ndesc & VMXNET3_MASK_RX_NDESC) ndesc &= ~VMXNET3_MASK_RX_NDESC; sc->vmx_nrxdescs = ndesc; sc->vmx_max_rxsegs = VMXNET3_MAX_RX_SEGS; } static void vmxnet3_check_multiqueue(struct vmxnet3_softc *sc) { if (sc->vmx_intr_type != VMXNET3_IT_MSIX) goto out; /* BMV: Just use the maximum configured for now. */ sc->vmx_nrxqueues = sc->vmx_max_nrxqueues; sc->vmx_ntxqueues = sc->vmx_max_ntxqueues; if (sc->vmx_nrxqueues > 1) sc->vmx_flags |= VMXNET3_FLAG_RSS; return; out: sc->vmx_ntxqueues = 1; sc->vmx_nrxqueues = 1; } static int vmxnet3_alloc_msix_interrupts(struct vmxnet3_softc *sc) { device_t dev; int nmsix, cnt, required; dev = sc->vmx_dev; if (sc->vmx_flags & VMXNET3_FLAG_NO_MSIX) return (1); /* Allocate an additional vector for the events interrupt. */ required = sc->vmx_max_nrxqueues + sc->vmx_max_ntxqueues + 1; nmsix = pci_msix_count(dev); if (nmsix < required) return (1); cnt = required; if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) { sc->vmx_nintrs = required; return (0); } else pci_release_msi(dev); /* BMV TODO Fallback to sharing MSIX vectors if possible. */ return (1); } static int vmxnet3_alloc_msi_interrupts(struct vmxnet3_softc *sc) { device_t dev; int nmsi, cnt, required; dev = sc->vmx_dev; required = 1; nmsi = pci_msi_count(dev); if (nmsi < required) return (1); cnt = required; if (pci_alloc_msi(dev, &cnt) == 0 && cnt >= required) { sc->vmx_nintrs = 1; return (0); } else pci_release_msi(dev); return (1); } static int vmxnet3_alloc_legacy_interrupts(struct vmxnet3_softc *sc) { sc->vmx_nintrs = 1; return (0); } static int vmxnet3_alloc_interrupt(struct vmxnet3_softc *sc, int rid, int flags, struct vmxnet3_interrupt *intr) { struct resource *irq; irq = bus_alloc_resource_any(sc->vmx_dev, SYS_RES_IRQ, &rid, flags); if (irq == NULL) return (ENXIO); intr->vmxi_irq = irq; intr->vmxi_rid = rid; return (0); } static int vmxnet3_alloc_intr_resources(struct vmxnet3_softc *sc) { int i, rid, flags, error; rid = 0; flags = RF_ACTIVE; if (sc->vmx_intr_type == VMXNET3_IT_LEGACY) flags |= RF_SHAREABLE; else rid = 1; for (i = 0; i < sc->vmx_nintrs; i++, rid++) { error = vmxnet3_alloc_interrupt(sc, rid, flags, &sc->vmx_intrs[i]); if (error) return (error); } return (0); } static int vmxnet3_setup_msix_interrupts(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_txqueue *txq; struct vmxnet3_rxqueue *rxq; struct vmxnet3_interrupt *intr; enum intr_type type; int i, error; dev = sc->vmx_dev; intr = &sc->vmx_intrs[0]; type = INTR_TYPE_NET | INTR_MPSAFE; for (i = 0; i < sc->vmx_ntxqueues; i++, intr++) { txq = &sc->vmx_txq[i]; error = bus_setup_intr(dev, intr->vmxi_irq, type, NULL, vmxnet3_txq_intr, txq, &intr->vmxi_handler); if (error) return (error); bus_describe_intr(dev, intr->vmxi_irq, intr->vmxi_handler, "tq%d", i); txq->vxtxq_intr_idx = intr->vmxi_rid - 1; } for (i = 0; i < sc->vmx_nrxqueues; i++, intr++) { rxq = &sc->vmx_rxq[i]; error = bus_setup_intr(dev, intr->vmxi_irq, type, NULL, vmxnet3_rxq_intr, rxq, &intr->vmxi_handler); if (error) return (error); bus_describe_intr(dev, intr->vmxi_irq, intr->vmxi_handler, "rq%d", i); rxq->vxrxq_intr_idx = intr->vmxi_rid - 1; } error = bus_setup_intr(dev, intr->vmxi_irq, type, NULL, vmxnet3_event_intr, sc, &intr->vmxi_handler); if (error) return (error); bus_describe_intr(dev, intr->vmxi_irq, intr->vmxi_handler, "event"); sc->vmx_event_intr_idx = intr->vmxi_rid - 1; return (0); } static int vmxnet3_setup_legacy_interrupt(struct vmxnet3_softc *sc) { struct vmxnet3_interrupt *intr; int i, error; intr = &sc->vmx_intrs[0]; error = bus_setup_intr(sc->vmx_dev, intr->vmxi_irq, INTR_TYPE_NET | INTR_MPSAFE, NULL, vmxnet3_legacy_intr, sc, &intr->vmxi_handler); for (i = 0; i < sc->vmx_ntxqueues; i++) sc->vmx_txq[i].vxtxq_intr_idx = 0; for (i = 0; i < sc->vmx_nrxqueues; i++) sc->vmx_rxq[i].vxrxq_intr_idx = 0; sc->vmx_event_intr_idx = 0; return (error); } static void vmxnet3_set_interrupt_idx(struct vmxnet3_softc *sc) { struct vmxnet3_txqueue *txq; struct vmxnet3_txq_shared *txs; struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxq_shared *rxs; int i; sc->vmx_ds->evintr = sc->vmx_event_intr_idx; for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; txs = txq->vxtxq_ts; txs->intr_idx = txq->vxtxq_intr_idx; } for (i = 0; i < sc->vmx_nrxqueues; i++) { rxq = &sc->vmx_rxq[i]; rxs = rxq->vxrxq_rs; rxs->intr_idx = rxq->vxrxq_intr_idx; } } static int vmxnet3_setup_interrupts(struct vmxnet3_softc *sc) { int error; error = vmxnet3_alloc_intr_resources(sc); if (error) return (error); switch (sc->vmx_intr_type) { case VMXNET3_IT_MSIX: error = vmxnet3_setup_msix_interrupts(sc); break; case VMXNET3_IT_MSI: case VMXNET3_IT_LEGACY: error = vmxnet3_setup_legacy_interrupt(sc); break; default: panic("%s: invalid interrupt type %d", __func__, sc->vmx_intr_type); } if (error == 0) vmxnet3_set_interrupt_idx(sc); return (error); } static int vmxnet3_alloc_interrupts(struct vmxnet3_softc *sc) { device_t dev; uint32_t config; int error; dev = sc->vmx_dev; config = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_INTRCFG); sc->vmx_intr_type = config & 0x03; sc->vmx_intr_mask_mode = (config >> 2) & 0x03; switch (sc->vmx_intr_type) { case VMXNET3_IT_AUTO: sc->vmx_intr_type = VMXNET3_IT_MSIX; /* FALLTHROUGH */ case VMXNET3_IT_MSIX: error = vmxnet3_alloc_msix_interrupts(sc); if (error == 0) break; sc->vmx_intr_type = VMXNET3_IT_MSI; /* FALLTHROUGH */ case VMXNET3_IT_MSI: error = vmxnet3_alloc_msi_interrupts(sc); if (error == 0) break; sc->vmx_intr_type = VMXNET3_IT_LEGACY; /* FALLTHROUGH */ case VMXNET3_IT_LEGACY: error = vmxnet3_alloc_legacy_interrupts(sc); if (error == 0) break; /* FALLTHROUGH */ default: sc->vmx_intr_type = -1; device_printf(dev, "cannot allocate any interrupt resources\n"); return (ENXIO); } return (error); } static void vmxnet3_free_interrupt(struct vmxnet3_softc *sc, struct vmxnet3_interrupt *intr) { device_t dev; dev = sc->vmx_dev; if (intr->vmxi_handler != NULL) { bus_teardown_intr(dev, intr->vmxi_irq, intr->vmxi_handler); intr->vmxi_handler = NULL; } if (intr->vmxi_irq != NULL) { bus_release_resource(dev, SYS_RES_IRQ, intr->vmxi_rid, intr->vmxi_irq); intr->vmxi_irq = NULL; intr->vmxi_rid = -1; } } static void vmxnet3_free_interrupts(struct vmxnet3_softc *sc) { int i; for (i = 0; i < sc->vmx_nintrs; i++) vmxnet3_free_interrupt(sc, &sc->vmx_intrs[i]); if (sc->vmx_intr_type == VMXNET3_IT_MSI || sc->vmx_intr_type == VMXNET3_IT_MSIX) pci_release_msi(sc->vmx_dev); } #ifndef VMXNET3_LEGACY_TX static int vmxnet3_alloc_taskqueue(struct vmxnet3_softc *sc) { device_t dev; dev = sc->vmx_dev; sc->vmx_tq = taskqueue_create(device_get_nameunit(dev), M_NOWAIT, taskqueue_thread_enqueue, &sc->vmx_tq); if (sc->vmx_tq == NULL) return (ENOMEM); return (0); } static void vmxnet3_start_taskqueue(struct vmxnet3_softc *sc) { device_t dev; int nthreads, error; dev = sc->vmx_dev; /* * The taskqueue is typically not frequently used, so a dedicated * thread for each queue is unnecessary. */ nthreads = MAX(1, sc->vmx_ntxqueues / 2); /* * Most drivers just ignore the return value - it only fails * with ENOMEM so an error is not likely. It is hard for us * to recover from an error here. */ error = taskqueue_start_threads(&sc->vmx_tq, nthreads, PI_NET, "%s taskq", device_get_nameunit(dev)); if (error) device_printf(dev, "failed to start taskqueue: %d", error); } static void vmxnet3_drain_taskqueue(struct vmxnet3_softc *sc) { struct vmxnet3_txqueue *txq; int i; if (sc->vmx_tq != NULL) { for (i = 0; i < sc->vmx_max_ntxqueues; i++) { txq = &sc->vmx_txq[i]; taskqueue_drain(sc->vmx_tq, &txq->vxtxq_defrtask); } } } static void vmxnet3_free_taskqueue(struct vmxnet3_softc *sc) { if (sc->vmx_tq != NULL) { taskqueue_free(sc->vmx_tq); sc->vmx_tq = NULL; } } #endif static int vmxnet3_init_rxq(struct vmxnet3_softc *sc, int q) { struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxring *rxr; int i; rxq = &sc->vmx_rxq[q]; snprintf(rxq->vxrxq_name, sizeof(rxq->vxrxq_name), "%s-rx%d", device_get_nameunit(sc->vmx_dev), q); mtx_init(&rxq->vxrxq_mtx, rxq->vxrxq_name, NULL, MTX_DEF); rxq->vxrxq_sc = sc; rxq->vxrxq_id = q; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; rxr->vxrxr_rid = i; rxr->vxrxr_ndesc = sc->vmx_nrxdescs; rxr->vxrxr_rxbuf = malloc(rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxbuf), M_DEVBUF, M_NOWAIT | M_ZERO); if (rxr->vxrxr_rxbuf == NULL) return (ENOMEM); rxq->vxrxq_comp_ring.vxcr_ndesc += sc->vmx_nrxdescs; } return (0); } static int vmxnet3_init_txq(struct vmxnet3_softc *sc, int q) { struct vmxnet3_txqueue *txq; struct vmxnet3_txring *txr; txq = &sc->vmx_txq[q]; txr = &txq->vxtxq_cmd_ring; snprintf(txq->vxtxq_name, sizeof(txq->vxtxq_name), "%s-tx%d", device_get_nameunit(sc->vmx_dev), q); mtx_init(&txq->vxtxq_mtx, txq->vxtxq_name, NULL, MTX_DEF); txq->vxtxq_sc = sc; txq->vxtxq_id = q; txr->vxtxr_ndesc = sc->vmx_ntxdescs; txr->vxtxr_txbuf = malloc(txr->vxtxr_ndesc * sizeof(struct vmxnet3_txbuf), M_DEVBUF, M_NOWAIT | M_ZERO); if (txr->vxtxr_txbuf == NULL) return (ENOMEM); txq->vxtxq_comp_ring.vxcr_ndesc = sc->vmx_ntxdescs; #ifndef VMXNET3_LEGACY_TX TASK_INIT(&txq->vxtxq_defrtask, 0, vmxnet3_txq_tq_deferred, txq); txq->vxtxq_br = buf_ring_alloc(VMXNET3_DEF_BUFRING_SIZE, M_DEVBUF, M_NOWAIT, &txq->vxtxq_mtx); if (txq->vxtxq_br == NULL) return (ENOMEM); #endif return (0); } static int vmxnet3_alloc_rxtx_queues(struct vmxnet3_softc *sc) { int i, error; /* * Only attempt to create multiple queues if MSIX is available. MSIX is * disabled by default because its apparently broken for devices passed * through by at least ESXi 5.1. The hw.pci.honor_msi_blacklist tunable * must be set to zero for MSIX. This check prevents us from allocating * queue structures that we will not use. */ if (sc->vmx_flags & VMXNET3_FLAG_NO_MSIX) { sc->vmx_max_nrxqueues = 1; sc->vmx_max_ntxqueues = 1; } sc->vmx_rxq = malloc(sizeof(struct vmxnet3_rxqueue) * sc->vmx_max_nrxqueues, M_DEVBUF, M_NOWAIT | M_ZERO); sc->vmx_txq = malloc(sizeof(struct vmxnet3_txqueue) * sc->vmx_max_ntxqueues, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vmx_rxq == NULL || sc->vmx_txq == NULL) return (ENOMEM); for (i = 0; i < sc->vmx_max_nrxqueues; i++) { error = vmxnet3_init_rxq(sc, i); if (error) return (error); } for (i = 0; i < sc->vmx_max_ntxqueues; i++) { error = vmxnet3_init_txq(sc, i); if (error) return (error); } return (0); } static void vmxnet3_destroy_rxq(struct vmxnet3_rxqueue *rxq) { struct vmxnet3_rxring *rxr; int i; rxq->vxrxq_sc = NULL; rxq->vxrxq_id = -1; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; if (rxr->vxrxr_rxbuf != NULL) { free(rxr->vxrxr_rxbuf, M_DEVBUF); rxr->vxrxr_rxbuf = NULL; } } if (mtx_initialized(&rxq->vxrxq_mtx) != 0) mtx_destroy(&rxq->vxrxq_mtx); } static void vmxnet3_destroy_txq(struct vmxnet3_txqueue *txq) { struct vmxnet3_txring *txr; txr = &txq->vxtxq_cmd_ring; txq->vxtxq_sc = NULL; txq->vxtxq_id = -1; #ifndef VMXNET3_LEGACY_TX if (txq->vxtxq_br != NULL) { buf_ring_free(txq->vxtxq_br, M_DEVBUF); txq->vxtxq_br = NULL; } #endif if (txr->vxtxr_txbuf != NULL) { free(txr->vxtxr_txbuf, M_DEVBUF); txr->vxtxr_txbuf = NULL; } if (mtx_initialized(&txq->vxtxq_mtx) != 0) mtx_destroy(&txq->vxtxq_mtx); } static void vmxnet3_free_rxtx_queues(struct vmxnet3_softc *sc) { int i; if (sc->vmx_rxq != NULL) { for (i = 0; i < sc->vmx_max_nrxqueues; i++) vmxnet3_destroy_rxq(&sc->vmx_rxq[i]); free(sc->vmx_rxq, M_DEVBUF); sc->vmx_rxq = NULL; } if (sc->vmx_txq != NULL) { for (i = 0; i < sc->vmx_max_ntxqueues; i++) vmxnet3_destroy_txq(&sc->vmx_txq[i]); free(sc->vmx_txq, M_DEVBUF); sc->vmx_txq = NULL; } } static int vmxnet3_alloc_shared_data(struct vmxnet3_softc *sc) { device_t dev; uint8_t *kva; size_t size; int i, error; dev = sc->vmx_dev; size = sizeof(struct vmxnet3_driver_shared); error = vmxnet3_dma_malloc(sc, size, 1, &sc->vmx_ds_dma); if (error) { device_printf(dev, "cannot alloc shared memory\n"); return (error); } sc->vmx_ds = (struct vmxnet3_driver_shared *) sc->vmx_ds_dma.dma_vaddr; size = sc->vmx_ntxqueues * sizeof(struct vmxnet3_txq_shared) + sc->vmx_nrxqueues * sizeof(struct vmxnet3_rxq_shared); error = vmxnet3_dma_malloc(sc, size, 128, &sc->vmx_qs_dma); if (error) { device_printf(dev, "cannot alloc queue shared memory\n"); return (error); } sc->vmx_qs = (void *) sc->vmx_qs_dma.dma_vaddr; kva = sc->vmx_qs; for (i = 0; i < sc->vmx_ntxqueues; i++) { sc->vmx_txq[i].vxtxq_ts = (struct vmxnet3_txq_shared *) kva; kva += sizeof(struct vmxnet3_txq_shared); } for (i = 0; i < sc->vmx_nrxqueues; i++) { sc->vmx_rxq[i].vxrxq_rs = (struct vmxnet3_rxq_shared *) kva; kva += sizeof(struct vmxnet3_rxq_shared); } if (sc->vmx_flags & VMXNET3_FLAG_RSS) { size = sizeof(struct vmxnet3_rss_shared); error = vmxnet3_dma_malloc(sc, size, 128, &sc->vmx_rss_dma); if (error) { device_printf(dev, "cannot alloc rss shared memory\n"); return (error); } sc->vmx_rss = (struct vmxnet3_rss_shared *) sc->vmx_rss_dma.dma_vaddr; } return (0); } static void vmxnet3_free_shared_data(struct vmxnet3_softc *sc) { if (sc->vmx_rss != NULL) { vmxnet3_dma_free(sc, &sc->vmx_rss_dma); sc->vmx_rss = NULL; } if (sc->vmx_qs != NULL) { vmxnet3_dma_free(sc, &sc->vmx_qs_dma); sc->vmx_qs = NULL; } if (sc->vmx_ds != NULL) { vmxnet3_dma_free(sc, &sc->vmx_ds_dma); sc->vmx_ds = NULL; } } static int vmxnet3_alloc_txq_data(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_txqueue *txq; struct vmxnet3_txring *txr; struct vmxnet3_comp_ring *txc; size_t descsz, compsz; int i, q, error; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_ntxqueues; q++) { txq = &sc->vmx_txq[q]; txr = &txq->vxtxq_cmd_ring; txc = &txq->vxtxq_comp_ring; descsz = txr->vxtxr_ndesc * sizeof(struct vmxnet3_txdesc); compsz = txr->vxtxr_ndesc * sizeof(struct vmxnet3_txcompdesc); error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ VMXNET3_TX_MAXSIZE, /* maxsize */ VMXNET3_TX_MAXSEGS, /* nsegments */ VMXNET3_TX_MAXSEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &txr->vxtxr_txtag); if (error) { device_printf(dev, "unable to create Tx buffer tag for queue %d\n", q); return (error); } error = vmxnet3_dma_malloc(sc, descsz, 512, &txr->vxtxr_dma); if (error) { device_printf(dev, "cannot alloc Tx descriptors for " "queue %d error %d\n", q, error); return (error); } txr->vxtxr_txd = (struct vmxnet3_txdesc *) txr->vxtxr_dma.dma_vaddr; error = vmxnet3_dma_malloc(sc, compsz, 512, &txc->vxcr_dma); if (error) { device_printf(dev, "cannot alloc Tx comp descriptors " "for queue %d error %d\n", q, error); return (error); } txc->vxcr_u.txcd = (struct vmxnet3_txcompdesc *) txc->vxcr_dma.dma_vaddr; for (i = 0; i < txr->vxtxr_ndesc; i++) { error = bus_dmamap_create(txr->vxtxr_txtag, 0, &txr->vxtxr_txbuf[i].vtxb_dmamap); if (error) { device_printf(dev, "unable to create Tx buf " "dmamap for queue %d idx %d\n", q, i); return (error); } } } return (0); } static void vmxnet3_free_txq_data(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_txqueue *txq; struct vmxnet3_txring *txr; struct vmxnet3_comp_ring *txc; struct vmxnet3_txbuf *txb; int i, q; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_ntxqueues; q++) { txq = &sc->vmx_txq[q]; txr = &txq->vxtxq_cmd_ring; txc = &txq->vxtxq_comp_ring; for (i = 0; i < txr->vxtxr_ndesc; i++) { txb = &txr->vxtxr_txbuf[i]; if (txb->vtxb_dmamap != NULL) { bus_dmamap_destroy(txr->vxtxr_txtag, txb->vtxb_dmamap); txb->vtxb_dmamap = NULL; } } if (txc->vxcr_u.txcd != NULL) { vmxnet3_dma_free(sc, &txc->vxcr_dma); txc->vxcr_u.txcd = NULL; } if (txr->vxtxr_txd != NULL) { vmxnet3_dma_free(sc, &txr->vxtxr_dma); txr->vxtxr_txd = NULL; } if (txr->vxtxr_txtag != NULL) { bus_dma_tag_destroy(txr->vxtxr_txtag); txr->vxtxr_txtag = NULL; } } } static int vmxnet3_alloc_rxq_data(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; int descsz, compsz; int i, j, q, error; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_nrxqueues; q++) { rxq = &sc->vmx_rxq[q]; rxc = &rxq->vxrxq_comp_ring; compsz = 0; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; descsz = rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxdesc); compsz += rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxcompdesc); error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUMPAGESIZE, /* maxsize */ 1, /* nsegments */ MJUMPAGESIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &rxr->vxrxr_rxtag); if (error) { device_printf(dev, "unable to create Rx buffer tag for " "queue %d\n", q); return (error); } error = vmxnet3_dma_malloc(sc, descsz, 512, &rxr->vxrxr_dma); if (error) { device_printf(dev, "cannot allocate Rx " "descriptors for queue %d/%d error %d\n", i, q, error); return (error); } rxr->vxrxr_rxd = (struct vmxnet3_rxdesc *) rxr->vxrxr_dma.dma_vaddr; } error = vmxnet3_dma_malloc(sc, compsz, 512, &rxc->vxcr_dma); if (error) { device_printf(dev, "cannot alloc Rx comp descriptors " "for queue %d error %d\n", q, error); return (error); } rxc->vxcr_u.rxcd = (struct vmxnet3_rxcompdesc *) rxc->vxcr_dma.dma_vaddr; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; error = bus_dmamap_create(rxr->vxrxr_rxtag, 0, &rxr->vxrxr_spare_dmap); if (error) { device_printf(dev, "unable to create spare " "dmamap for queue %d/%d error %d\n", q, i, error); return (error); } for (j = 0; j < rxr->vxrxr_ndesc; j++) { error = bus_dmamap_create(rxr->vxrxr_rxtag, 0, &rxr->vxrxr_rxbuf[j].vrxb_dmamap); if (error) { device_printf(dev, "unable to create " "dmamap for queue %d/%d slot %d " "error %d\n", q, i, j, error); return (error); } } } } return (0); } static void vmxnet3_free_rxq_data(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; struct vmxnet3_rxbuf *rxb; int i, j, q; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_nrxqueues; q++) { rxq = &sc->vmx_rxq[q]; rxc = &rxq->vxrxq_comp_ring; for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; if (rxr->vxrxr_spare_dmap != NULL) { bus_dmamap_destroy(rxr->vxrxr_rxtag, rxr->vxrxr_spare_dmap); rxr->vxrxr_spare_dmap = NULL; } for (j = 0; j < rxr->vxrxr_ndesc; j++) { rxb = &rxr->vxrxr_rxbuf[j]; if (rxb->vrxb_dmamap != NULL) { bus_dmamap_destroy(rxr->vxrxr_rxtag, rxb->vrxb_dmamap); rxb->vrxb_dmamap = NULL; } } } if (rxc->vxcr_u.rxcd != NULL) { vmxnet3_dma_free(sc, &rxc->vxcr_dma); rxc->vxcr_u.rxcd = NULL; } for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; if (rxr->vxrxr_rxd != NULL) { vmxnet3_dma_free(sc, &rxr->vxrxr_dma); rxr->vxrxr_rxd = NULL; } if (rxr->vxrxr_rxtag != NULL) { bus_dma_tag_destroy(rxr->vxrxr_rxtag); rxr->vxrxr_rxtag = NULL; } } } } static int vmxnet3_alloc_queue_data(struct vmxnet3_softc *sc) { int error; error = vmxnet3_alloc_txq_data(sc); if (error) return (error); error = vmxnet3_alloc_rxq_data(sc); if (error) return (error); return (0); } static void vmxnet3_free_queue_data(struct vmxnet3_softc *sc) { if (sc->vmx_rxq != NULL) vmxnet3_free_rxq_data(sc); if (sc->vmx_txq != NULL) vmxnet3_free_txq_data(sc); } static int vmxnet3_alloc_mcast_table(struct vmxnet3_softc *sc) { int error; error = vmxnet3_dma_malloc(sc, VMXNET3_MULTICAST_MAX * ETHER_ADDR_LEN, 32, &sc->vmx_mcast_dma); if (error) device_printf(sc->vmx_dev, "unable to alloc multicast table\n"); else sc->vmx_mcast = sc->vmx_mcast_dma.dma_vaddr; return (error); } static void vmxnet3_free_mcast_table(struct vmxnet3_softc *sc) { if (sc->vmx_mcast != NULL) { vmxnet3_dma_free(sc, &sc->vmx_mcast_dma); sc->vmx_mcast = NULL; } } static void vmxnet3_init_shared_data(struct vmxnet3_softc *sc) { struct vmxnet3_driver_shared *ds; struct vmxnet3_txqueue *txq; struct vmxnet3_txq_shared *txs; struct vmxnet3_rxqueue *rxq; struct vmxnet3_rxq_shared *rxs; int i; ds = sc->vmx_ds; /* * Initialize fields of the shared data that remains the same across * reinits. Note the shared data is zero'd when allocated. */ ds->magic = VMXNET3_REV1_MAGIC; /* DriverInfo */ ds->version = VMXNET3_DRIVER_VERSION; ds->guest = VMXNET3_GOS_FREEBSD | #ifdef __LP64__ VMXNET3_GOS_64BIT; #else VMXNET3_GOS_32BIT; #endif ds->vmxnet3_revision = 1; ds->upt_version = 1; /* Misc. conf */ ds->driver_data = vtophys(sc); ds->driver_data_len = sizeof(struct vmxnet3_softc); ds->queue_shared = sc->vmx_qs_dma.dma_paddr; ds->queue_shared_len = sc->vmx_qs_dma.dma_size; ds->nrxsg_max = sc->vmx_max_rxsegs; /* RSS conf */ if (sc->vmx_flags & VMXNET3_FLAG_RSS) { ds->rss.version = 1; ds->rss.paddr = sc->vmx_rss_dma.dma_paddr; ds->rss.len = sc->vmx_rss_dma.dma_size; } /* Interrupt control. */ ds->automask = sc->vmx_intr_mask_mode == VMXNET3_IMM_AUTO; ds->nintr = sc->vmx_nintrs; ds->evintr = sc->vmx_event_intr_idx; ds->ictrl = VMXNET3_ICTRL_DISABLE_ALL; for (i = 0; i < sc->vmx_nintrs; i++) ds->modlevel[i] = UPT1_IMOD_ADAPTIVE; /* Receive filter. */ ds->mcast_table = sc->vmx_mcast_dma.dma_paddr; ds->mcast_tablelen = sc->vmx_mcast_dma.dma_size; /* Tx queues */ for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; txs = txq->vxtxq_ts; txs->cmd_ring = txq->vxtxq_cmd_ring.vxtxr_dma.dma_paddr; txs->cmd_ring_len = txq->vxtxq_cmd_ring.vxtxr_ndesc; txs->comp_ring = txq->vxtxq_comp_ring.vxcr_dma.dma_paddr; txs->comp_ring_len = txq->vxtxq_comp_ring.vxcr_ndesc; txs->driver_data = vtophys(txq); txs->driver_data_len = sizeof(struct vmxnet3_txqueue); } /* Rx queues */ for (i = 0; i < sc->vmx_nrxqueues; i++) { rxq = &sc->vmx_rxq[i]; rxs = rxq->vxrxq_rs; rxs->cmd_ring[0] = rxq->vxrxq_cmd_ring[0].vxrxr_dma.dma_paddr; rxs->cmd_ring_len[0] = rxq->vxrxq_cmd_ring[0].vxrxr_ndesc; rxs->cmd_ring[1] = rxq->vxrxq_cmd_ring[1].vxrxr_dma.dma_paddr; rxs->cmd_ring_len[1] = rxq->vxrxq_cmd_ring[1].vxrxr_ndesc; rxs->comp_ring = rxq->vxrxq_comp_ring.vxcr_dma.dma_paddr; rxs->comp_ring_len = rxq->vxrxq_comp_ring.vxcr_ndesc; rxs->driver_data = vtophys(rxq); rxs->driver_data_len = sizeof(struct vmxnet3_rxqueue); } } static void vmxnet3_reinit_interface(struct vmxnet3_softc *sc) { struct ifnet *ifp; ifp = sc->vmx_ifp; /* Use the current MAC address. */ bcopy(IF_LLADDR(sc->vmx_ifp), sc->vmx_lladdr, ETHER_ADDR_LEN); vmxnet3_set_lladdr(sc); ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= VMXNET3_CSUM_OFFLOAD; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= VMXNET3_CSUM_OFFLOAD_IPV6; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; } static void vmxnet3_reinit_rss_shared_data(struct vmxnet3_softc *sc) { /* * Use the same key as the Linux driver until FreeBSD can do * RSS (presumably Toeplitz) in software. */ static const uint8_t rss_key[UPT1_RSS_MAX_KEY_SIZE] = { 0x3b, 0x56, 0xd1, 0x56, 0x13, 0x4a, 0xe7, 0xac, 0xe8, 0x79, 0x09, 0x75, 0xe8, 0x65, 0x79, 0x28, 0x35, 0x12, 0xb9, 0x56, 0x7c, 0x76, 0x4b, 0x70, 0xd8, 0x56, 0xa3, 0x18, 0x9b, 0x0a, 0xee, 0xf3, 0x96, 0xa6, 0x9f, 0x8f, 0x9e, 0x8c, 0x90, 0xc9, }; struct vmxnet3_driver_shared *ds; struct vmxnet3_rss_shared *rss; int i; ds = sc->vmx_ds; rss = sc->vmx_rss; rss->hash_type = UPT1_RSS_HASH_TYPE_IPV4 | UPT1_RSS_HASH_TYPE_TCP_IPV4 | UPT1_RSS_HASH_TYPE_IPV6 | UPT1_RSS_HASH_TYPE_TCP_IPV6; rss->hash_func = UPT1_RSS_HASH_FUNC_TOEPLITZ; rss->hash_key_size = UPT1_RSS_MAX_KEY_SIZE; rss->ind_table_size = UPT1_RSS_MAX_IND_TABLE_SIZE; memcpy(rss->hash_key, rss_key, UPT1_RSS_MAX_KEY_SIZE); for (i = 0; i < UPT1_RSS_MAX_IND_TABLE_SIZE; i++) rss->ind_table[i] = i % sc->vmx_nrxqueues; } static void vmxnet3_reinit_shared_data(struct vmxnet3_softc *sc) { struct ifnet *ifp; struct vmxnet3_driver_shared *ds; ifp = sc->vmx_ifp; ds = sc->vmx_ds; ds->mtu = ifp->if_mtu; ds->ntxqueue = sc->vmx_ntxqueues; ds->nrxqueue = sc->vmx_nrxqueues; ds->upt_features = 0; if (ifp->if_capenable & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) ds->upt_features |= UPT1_F_CSUM; if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) ds->upt_features |= UPT1_F_VLAN; if (ifp->if_capenable & IFCAP_LRO) ds->upt_features |= UPT1_F_LRO; if (sc->vmx_flags & VMXNET3_FLAG_RSS) { ds->upt_features |= UPT1_F_RSS; vmxnet3_reinit_rss_shared_data(sc); } vmxnet3_write_bar1(sc, VMXNET3_BAR1_DSL, sc->vmx_ds_dma.dma_paddr); vmxnet3_write_bar1(sc, VMXNET3_BAR1_DSH, (uint64_t) sc->vmx_ds_dma.dma_paddr >> 32); } static int vmxnet3_alloc_data(struct vmxnet3_softc *sc) { int error; error = vmxnet3_alloc_shared_data(sc); if (error) return (error); error = vmxnet3_alloc_queue_data(sc); if (error) return (error); error = vmxnet3_alloc_mcast_table(sc); if (error) return (error); vmxnet3_init_shared_data(sc); return (0); } static void vmxnet3_free_data(struct vmxnet3_softc *sc) { vmxnet3_free_mcast_table(sc); vmxnet3_free_queue_data(sc); vmxnet3_free_shared_data(sc); } static int vmxnet3_setup_interface(struct vmxnet3_softc *sc) { device_t dev; struct ifnet *ifp; dev = sc->vmx_dev; ifp = sc->vmx_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "cannot allocate ifnet structure\n"); return (ENOSPC); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); #if __FreeBSD_version < 1000025 ifp->if_baudrate = 1000000000; #elif __FreeBSD_version < 1100011 if_initbaudrate(ifp, IF_Gbps(10)); #else ifp->if_baudrate = IF_Gbps(10); #endif ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = vmxnet3_init; ifp->if_ioctl = vmxnet3_ioctl; ifp->if_get_counter = vmxnet3_get_counter; ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = VMXNET3_TX_MAXSEGS; ifp->if_hw_tsomaxsegsize = VMXNET3_TX_MAXSEGSIZE; #ifdef VMXNET3_LEGACY_TX ifp->if_start = vmxnet3_start; ifp->if_snd.ifq_drv_maxlen = sc->vmx_ntxdescs - 1; IFQ_SET_MAXLEN(&ifp->if_snd, sc->vmx_ntxdescs - 1); IFQ_SET_READY(&ifp->if_snd); #else ifp->if_transmit = vmxnet3_txq_mq_start; ifp->if_qflush = vmxnet3_qflush; #endif vmxnet3_get_lladdr(sc); ether_ifattach(ifp, sc->vmx_lladdr); ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_TXCSUM; ifp->if_capabilities |= IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6; ifp->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6; ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; ifp->if_capenable = ifp->if_capabilities; /* These capabilities are not enabled by default. */ ifp->if_capabilities |= IFCAP_LRO | IFCAP_VLAN_HWFILTER; sc->vmx_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, vmxnet3_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vmx_vlan_detach = EVENTHANDLER_REGISTER(vlan_config, vmxnet3_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); ifmedia_init(&sc->vmx_media, 0, vmxnet3_media_change, vmxnet3_media_status); ifmedia_add(&sc->vmx_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->vmx_media, IFM_ETHER | IFM_AUTO); return (0); } static void vmxnet3_evintr(struct vmxnet3_softc *sc) { device_t dev; struct ifnet *ifp; struct vmxnet3_txq_shared *ts; struct vmxnet3_rxq_shared *rs; uint32_t event; int reset; dev = sc->vmx_dev; ifp = sc->vmx_ifp; reset = 0; VMXNET3_CORE_LOCK(sc); /* Clear events. */ event = sc->vmx_ds->event; vmxnet3_write_bar1(sc, VMXNET3_BAR1_EVENT, event); if (event & VMXNET3_EVENT_LINK) { vmxnet3_link_status(sc); if (sc->vmx_link_active != 0) vmxnet3_tx_start_all(sc); } if (event & (VMXNET3_EVENT_TQERROR | VMXNET3_EVENT_RQERROR)) { reset = 1; vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_STATUS); ts = sc->vmx_txq[0].vxtxq_ts; if (ts->stopped != 0) device_printf(dev, "Tx queue error %#x\n", ts->error); rs = sc->vmx_rxq[0].vxrxq_rs; if (rs->stopped != 0) device_printf(dev, "Rx queue error %#x\n", rs->error); device_printf(dev, "Rx/Tx queue error event ... resetting\n"); } if (event & VMXNET3_EVENT_DIC) device_printf(dev, "device implementation change event\n"); if (event & VMXNET3_EVENT_DEBUG) device_printf(dev, "debug event\n"); if (reset != 0) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vmxnet3_init_locked(sc); } VMXNET3_CORE_UNLOCK(sc); } static void vmxnet3_txq_eof(struct vmxnet3_txqueue *txq) { struct vmxnet3_softc *sc; struct ifnet *ifp; struct vmxnet3_txring *txr; struct vmxnet3_comp_ring *txc; struct vmxnet3_txcompdesc *txcd; struct vmxnet3_txbuf *txb; struct mbuf *m; u_int sop; sc = txq->vxtxq_sc; ifp = sc->vmx_ifp; txr = &txq->vxtxq_cmd_ring; txc = &txq->vxtxq_comp_ring; VMXNET3_TXQ_LOCK_ASSERT(txq); for (;;) { txcd = &txc->vxcr_u.txcd[txc->vxcr_next]; if (txcd->gen != txc->vxcr_gen) break; vmxnet3_barrier(sc, VMXNET3_BARRIER_RD); if (++txc->vxcr_next == txc->vxcr_ndesc) { txc->vxcr_next = 0; txc->vxcr_gen ^= 1; } sop = txr->vxtxr_next; txb = &txr->vxtxr_txbuf[sop]; if ((m = txb->vtxb_m) != NULL) { bus_dmamap_sync(txr->vxtxr_txtag, txb->vtxb_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->vxtxr_txtag, txb->vtxb_dmamap); txq->vxtxq_stats.vmtxs_opackets++; txq->vxtxq_stats.vmtxs_obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) txq->vxtxq_stats.vmtxs_omcasts++; m_freem(m); txb->vtxb_m = NULL; } txr->vxtxr_next = (txcd->eop_idx + 1) % txr->vxtxr_ndesc; } if (txr->vxtxr_head == txr->vxtxr_next) txq->vxtxq_watchdog = 0; } static int vmxnet3_newbuf(struct vmxnet3_softc *sc, struct vmxnet3_rxring *rxr) { struct ifnet *ifp; struct mbuf *m; struct vmxnet3_rxdesc *rxd; struct vmxnet3_rxbuf *rxb; bus_dma_tag_t tag; bus_dmamap_t dmap; bus_dma_segment_t segs[1]; int idx, clsize, btype, flags, nsegs, error; ifp = sc->vmx_ifp; tag = rxr->vxrxr_rxtag; dmap = rxr->vxrxr_spare_dmap; idx = rxr->vxrxr_fill; rxd = &rxr->vxrxr_rxd[idx]; rxb = &rxr->vxrxr_rxbuf[idx]; #ifdef VMXNET3_FAILPOINTS KFAIL_POINT_CODE(VMXNET3_FP, newbuf, return ENOBUFS); if (rxr->vxrxr_rid != 0) KFAIL_POINT_CODE(VMXNET3_FP, newbuf_body_only, return ENOBUFS); #endif if (rxr->vxrxr_rid == 0 && (idx % sc->vmx_rx_max_chain) == 0) { flags = M_PKTHDR; clsize = MCLBYTES; btype = VMXNET3_BTYPE_HEAD; } else { #if __FreeBSD_version < 902001 /* * These mbufs will never be used for the start of a frame. * Roughly prior to branching releng/9.2, the load_mbuf_sg() * required the mbuf to always be a packet header. Avoid * unnecessary mbuf initialization in newer versions where * that is not the case. */ flags = M_PKTHDR; #else flags = 0; #endif clsize = MJUMPAGESIZE; btype = VMXNET3_BTYPE_BODY; } m = m_getjcl(M_NOWAIT, MT_DATA, flags, clsize); if (m == NULL) { sc->vmx_stats.vmst_mgetcl_failed++; return (ENOBUFS); } if (btype == VMXNET3_BTYPE_HEAD) { m->m_len = m->m_pkthdr.len = clsize; m_adj(m, ETHER_ALIGN); } else m->m_len = clsize; error = bus_dmamap_load_mbuf_sg(tag, dmap, m, &segs[0], &nsegs, BUS_DMA_NOWAIT); if (error) { m_freem(m); sc->vmx_stats.vmst_mbuf_load_failed++; return (error); } KASSERT(nsegs == 1, ("%s: mbuf %p with too many segments %d", __func__, m, nsegs)); #if __FreeBSD_version < 902001 if (btype == VMXNET3_BTYPE_BODY) m->m_flags &= ~M_PKTHDR; #endif if (rxb->vrxb_m != NULL) { bus_dmamap_sync(tag, rxb->vrxb_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(tag, rxb->vrxb_dmamap); } rxr->vxrxr_spare_dmap = rxb->vrxb_dmamap; rxb->vrxb_dmamap = dmap; rxb->vrxb_m = m; rxd->addr = segs[0].ds_addr; rxd->len = segs[0].ds_len; rxd->btype = btype; rxd->gen = rxr->vxrxr_gen; vmxnet3_rxr_increment_fill(rxr); return (0); } static void vmxnet3_rxq_eof_discard(struct vmxnet3_rxqueue *rxq, struct vmxnet3_rxring *rxr, int idx) { struct vmxnet3_rxdesc *rxd; rxd = &rxr->vxrxr_rxd[idx]; rxd->gen = rxr->vxrxr_gen; vmxnet3_rxr_increment_fill(rxr); } static void vmxnet3_rxq_discard_chain(struct vmxnet3_rxqueue *rxq) { struct vmxnet3_softc *sc; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; struct vmxnet3_rxcompdesc *rxcd; int idx, eof; sc = rxq->vxrxq_sc; rxc = &rxq->vxrxq_comp_ring; do { rxcd = &rxc->vxcr_u.rxcd[rxc->vxcr_next]; if (rxcd->gen != rxc->vxcr_gen) break; /* Not expected. */ vmxnet3_barrier(sc, VMXNET3_BARRIER_RD); if (++rxc->vxcr_next == rxc->vxcr_ndesc) { rxc->vxcr_next = 0; rxc->vxcr_gen ^= 1; } idx = rxcd->rxd_idx; eof = rxcd->eop; if (rxcd->qid < sc->vmx_nrxqueues) rxr = &rxq->vxrxq_cmd_ring[0]; else rxr = &rxq->vxrxq_cmd_ring[1]; vmxnet3_rxq_eof_discard(rxq, rxr, idx); } while (!eof); } static void vmxnet3_rx_csum(struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m) { if (rxcd->ipv4) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; if (rxcd->ipcsum_ok) m->m_pkthdr.csum_flags |= CSUM_IP_VALID; } if (!rxcd->fragment) { if (rxcd->csum_ok && (rxcd->tcp || rxcd->udp)) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; } } } static void vmxnet3_rxq_input(struct vmxnet3_rxqueue *rxq, struct vmxnet3_rxcompdesc *rxcd, struct mbuf *m) { struct vmxnet3_softc *sc; struct ifnet *ifp; sc = rxq->vxrxq_sc; ifp = sc->vmx_ifp; if (rxcd->error) { rxq->vxrxq_stats.vmrxs_ierrors++; m_freem(m); return; } #ifdef notyet switch (rxcd->rss_type) { case VMXNET3_RCD_RSS_TYPE_IPV4: m->m_pkthdr.flowid = rxcd->rss_hash; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV4); break; case VMXNET3_RCD_RSS_TYPE_TCPIPV4: m->m_pkthdr.flowid = rxcd->rss_hash; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_TCP_IPV4); break; case VMXNET3_RCD_RSS_TYPE_IPV6: m->m_pkthdr.flowid = rxcd->rss_hash; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV6); break; case VMXNET3_RCD_RSS_TYPE_TCPIPV6: m->m_pkthdr.flowid = rxcd->rss_hash; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_TCP_IPV6); break; default: /* VMXNET3_RCD_RSS_TYPE_NONE */ m->m_pkthdr.flowid = rxq->vxrxq_id; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); break; } #else m->m_pkthdr.flowid = rxq->vxrxq_id; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); #endif if (!rxcd->no_csum) vmxnet3_rx_csum(rxcd, m); if (rxcd->vlan) { m->m_flags |= M_VLANTAG; m->m_pkthdr.ether_vtag = rxcd->vtag; } rxq->vxrxq_stats.vmrxs_ipackets++; rxq->vxrxq_stats.vmrxs_ibytes += m->m_pkthdr.len; VMXNET3_RXQ_UNLOCK(rxq); (*ifp->if_input)(ifp, m); VMXNET3_RXQ_LOCK(rxq); } static void vmxnet3_rxq_eof(struct vmxnet3_rxqueue *rxq) { struct vmxnet3_softc *sc; struct ifnet *ifp; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; struct vmxnet3_rxdesc *rxd; struct vmxnet3_rxcompdesc *rxcd; struct mbuf *m, *m_head, *m_tail; int idx, length; sc = rxq->vxrxq_sc; ifp = sc->vmx_ifp; rxc = &rxq->vxrxq_comp_ring; VMXNET3_RXQ_LOCK_ASSERT(rxq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; m_head = rxq->vxrxq_mhead; rxq->vxrxq_mhead = NULL; m_tail = rxq->vxrxq_mtail; rxq->vxrxq_mtail = NULL; MPASS(m_head == NULL || m_tail != NULL); for (;;) { rxcd = &rxc->vxcr_u.rxcd[rxc->vxcr_next]; if (rxcd->gen != rxc->vxcr_gen) { rxq->vxrxq_mhead = m_head; rxq->vxrxq_mtail = m_tail; break; } vmxnet3_barrier(sc, VMXNET3_BARRIER_RD); if (++rxc->vxcr_next == rxc->vxcr_ndesc) { rxc->vxcr_next = 0; rxc->vxcr_gen ^= 1; } idx = rxcd->rxd_idx; length = rxcd->len; if (rxcd->qid < sc->vmx_nrxqueues) rxr = &rxq->vxrxq_cmd_ring[0]; else rxr = &rxq->vxrxq_cmd_ring[1]; rxd = &rxr->vxrxr_rxd[idx]; m = rxr->vxrxr_rxbuf[idx].vrxb_m; KASSERT(m != NULL, ("%s: queue %d idx %d without mbuf", __func__, rxcd->qid, idx)); /* * The host may skip descriptors. We detect this when this * descriptor does not match the previous fill index. Catch * up with the host now. */ if (__predict_false(rxr->vxrxr_fill != idx)) { while (rxr->vxrxr_fill != idx) { rxr->vxrxr_rxd[rxr->vxrxr_fill].gen = rxr->vxrxr_gen; vmxnet3_rxr_increment_fill(rxr); } } if (rxcd->sop) { KASSERT(rxd->btype == VMXNET3_BTYPE_HEAD, ("%s: start of frame w/o head buffer", __func__)); KASSERT(rxr == &rxq->vxrxq_cmd_ring[0], ("%s: start of frame not in ring 0", __func__)); KASSERT((idx % sc->vmx_rx_max_chain) == 0, ("%s: start of frame at unexcepted index %d (%d)", __func__, idx, sc->vmx_rx_max_chain)); KASSERT(m_head == NULL, ("%s: duplicate start of frame?", __func__)); if (length == 0) { /* Just ignore this descriptor. */ vmxnet3_rxq_eof_discard(rxq, rxr, idx); goto nextp; } if (vmxnet3_newbuf(sc, rxr) != 0) { rxq->vxrxq_stats.vmrxs_iqdrops++; vmxnet3_rxq_eof_discard(rxq, rxr, idx); if (!rxcd->eop) vmxnet3_rxq_discard_chain(rxq); goto nextp; } m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = length; m->m_pkthdr.csum_flags = 0; m_head = m_tail = m; } else { KASSERT(rxd->btype == VMXNET3_BTYPE_BODY, ("%s: non start of frame w/o body buffer", __func__)); KASSERT(m_head != NULL, ("%s: frame not started?", __func__)); if (vmxnet3_newbuf(sc, rxr) != 0) { rxq->vxrxq_stats.vmrxs_iqdrops++; vmxnet3_rxq_eof_discard(rxq, rxr, idx); if (!rxcd->eop) vmxnet3_rxq_discard_chain(rxq); m_freem(m_head); m_head = m_tail = NULL; goto nextp; } m->m_len = length; m_head->m_pkthdr.len += length; m_tail->m_next = m; m_tail = m; } if (rxcd->eop) { vmxnet3_rxq_input(rxq, rxcd, m_head); m_head = m_tail = NULL; /* Must recheck after dropping the Rx lock. */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } nextp: if (__predict_false(rxq->vxrxq_rs->update_rxhead)) { int qid = rxcd->qid; bus_size_t r; idx = (idx + 1) % rxr->vxrxr_ndesc; if (qid >= sc->vmx_nrxqueues) { qid -= sc->vmx_nrxqueues; r = VMXNET3_BAR0_RXH2(qid); } else r = VMXNET3_BAR0_RXH1(qid); vmxnet3_write_bar0(sc, r, idx); } } } static void vmxnet3_legacy_intr(void *xsc) { struct vmxnet3_softc *sc; struct vmxnet3_rxqueue *rxq; struct vmxnet3_txqueue *txq; sc = xsc; rxq = &sc->vmx_rxq[0]; txq = &sc->vmx_txq[0]; if (sc->vmx_intr_type == VMXNET3_IT_LEGACY) { if (vmxnet3_read_bar1(sc, VMXNET3_BAR1_INTR) == 0) return; } if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE) vmxnet3_disable_all_intrs(sc); if (sc->vmx_ds->event != 0) vmxnet3_evintr(sc); VMXNET3_RXQ_LOCK(rxq); vmxnet3_rxq_eof(rxq); VMXNET3_RXQ_UNLOCK(rxq); VMXNET3_TXQ_LOCK(txq); vmxnet3_txq_eof(txq); vmxnet3_txq_start(txq); VMXNET3_TXQ_UNLOCK(txq); vmxnet3_enable_all_intrs(sc); } static void vmxnet3_txq_intr(void *xtxq) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; txq = xtxq; sc = txq->vxtxq_sc; if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE) vmxnet3_disable_intr(sc, txq->vxtxq_intr_idx); VMXNET3_TXQ_LOCK(txq); vmxnet3_txq_eof(txq); vmxnet3_txq_start(txq); VMXNET3_TXQ_UNLOCK(txq); vmxnet3_enable_intr(sc, txq->vxtxq_intr_idx); } static void vmxnet3_rxq_intr(void *xrxq) { struct vmxnet3_softc *sc; struct vmxnet3_rxqueue *rxq; rxq = xrxq; sc = rxq->vxrxq_sc; if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE) vmxnet3_disable_intr(sc, rxq->vxrxq_intr_idx); VMXNET3_RXQ_LOCK(rxq); vmxnet3_rxq_eof(rxq); VMXNET3_RXQ_UNLOCK(rxq); vmxnet3_enable_intr(sc, rxq->vxrxq_intr_idx); } static void vmxnet3_event_intr(void *xsc) { struct vmxnet3_softc *sc; sc = xsc; if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE) vmxnet3_disable_intr(sc, sc->vmx_event_intr_idx); if (sc->vmx_ds->event != 0) vmxnet3_evintr(sc); vmxnet3_enable_intr(sc, sc->vmx_event_intr_idx); } static void vmxnet3_txstop(struct vmxnet3_softc *sc, struct vmxnet3_txqueue *txq) { struct vmxnet3_txring *txr; struct vmxnet3_txbuf *txb; int i; txr = &txq->vxtxq_cmd_ring; for (i = 0; i < txr->vxtxr_ndesc; i++) { txb = &txr->vxtxr_txbuf[i]; if (txb->vtxb_m == NULL) continue; bus_dmamap_sync(txr->vxtxr_txtag, txb->vtxb_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->vxtxr_txtag, txb->vtxb_dmamap); m_freem(txb->vtxb_m); txb->vtxb_m = NULL; } } static void vmxnet3_rxstop(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rxq) { struct vmxnet3_rxring *rxr; struct vmxnet3_rxbuf *rxb; int i, j; if (rxq->vxrxq_mhead != NULL) { m_freem(rxq->vxrxq_mhead); rxq->vxrxq_mhead = NULL; rxq->vxrxq_mtail = NULL; } for (i = 0; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; for (j = 0; j < rxr->vxrxr_ndesc; j++) { rxb = &rxr->vxrxr_rxbuf[j]; if (rxb->vrxb_m == NULL) continue; bus_dmamap_sync(rxr->vxrxr_rxtag, rxb->vrxb_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->vxrxr_rxtag, rxb->vrxb_dmamap); m_freem(rxb->vrxb_m); rxb->vrxb_m = NULL; } } } static void vmxnet3_stop_rendezvous(struct vmxnet3_softc *sc) { struct vmxnet3_rxqueue *rxq; struct vmxnet3_txqueue *txq; int i; for (i = 0; i < sc->vmx_nrxqueues; i++) { rxq = &sc->vmx_rxq[i]; VMXNET3_RXQ_LOCK(rxq); VMXNET3_RXQ_UNLOCK(rxq); } for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; VMXNET3_TXQ_LOCK(txq); VMXNET3_TXQ_UNLOCK(txq); } } static void vmxnet3_stop(struct vmxnet3_softc *sc) { struct ifnet *ifp; int q; ifp = sc->vmx_ifp; VMXNET3_CORE_LOCK_ASSERT(sc); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; sc->vmx_link_active = 0; callout_stop(&sc->vmx_tick); /* Disable interrupts. */ vmxnet3_disable_all_intrs(sc); vmxnet3_write_cmd(sc, VMXNET3_CMD_DISABLE); vmxnet3_stop_rendezvous(sc); for (q = 0; q < sc->vmx_ntxqueues; q++) vmxnet3_txstop(sc, &sc->vmx_txq[q]); for (q = 0; q < sc->vmx_nrxqueues; q++) vmxnet3_rxstop(sc, &sc->vmx_rxq[q]); vmxnet3_write_cmd(sc, VMXNET3_CMD_RESET); } static void vmxnet3_txinit(struct vmxnet3_softc *sc, struct vmxnet3_txqueue *txq) { struct vmxnet3_txring *txr; struct vmxnet3_comp_ring *txc; txr = &txq->vxtxq_cmd_ring; txr->vxtxr_head = 0; txr->vxtxr_next = 0; txr->vxtxr_gen = VMXNET3_INIT_GEN; bzero(txr->vxtxr_txd, txr->vxtxr_ndesc * sizeof(struct vmxnet3_txdesc)); txc = &txq->vxtxq_comp_ring; txc->vxcr_next = 0; txc->vxcr_gen = VMXNET3_INIT_GEN; bzero(txc->vxcr_u.txcd, txc->vxcr_ndesc * sizeof(struct vmxnet3_txcompdesc)); } static int vmxnet3_rxinit(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rxq) { struct ifnet *ifp; struct vmxnet3_rxring *rxr; struct vmxnet3_comp_ring *rxc; int i, populate, idx, frame_size, error; ifp = sc->vmx_ifp; frame_size = ETHER_ALIGN + sizeof(struct ether_vlan_header) + ifp->if_mtu; /* * If the MTU causes us to exceed what a regular sized cluster can * handle, we allocate a second MJUMPAGESIZE cluster after it in * ring 0. If in use, ring 1 always contains MJUMPAGESIZE clusters. * * Keep rx_max_chain a divisor of the maximum Rx ring size to make * our life easier. We do not support changing the ring size after * the attach. */ if (frame_size <= MCLBYTES) sc->vmx_rx_max_chain = 1; else sc->vmx_rx_max_chain = 2; /* * Only populate ring 1 if the configuration will take advantage * of it. That is either when LRO is enabled or the frame size * exceeds what ring 0 can contain. */ if ((ifp->if_capenable & IFCAP_LRO) == 0 && frame_size <= MCLBYTES + MJUMPAGESIZE) populate = 1; else populate = VMXNET3_RXRINGS_PERQ; for (i = 0; i < populate; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; rxr->vxrxr_fill = 0; rxr->vxrxr_gen = VMXNET3_INIT_GEN; bzero(rxr->vxrxr_rxd, rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxdesc)); for (idx = 0; idx < rxr->vxrxr_ndesc; idx++) { error = vmxnet3_newbuf(sc, rxr); if (error) return (error); } } for (/**/; i < VMXNET3_RXRINGS_PERQ; i++) { rxr = &rxq->vxrxq_cmd_ring[i]; rxr->vxrxr_fill = 0; rxr->vxrxr_gen = 0; bzero(rxr->vxrxr_rxd, rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxdesc)); } rxc = &rxq->vxrxq_comp_ring; rxc->vxcr_next = 0; rxc->vxcr_gen = VMXNET3_INIT_GEN; bzero(rxc->vxcr_u.rxcd, rxc->vxcr_ndesc * sizeof(struct vmxnet3_rxcompdesc)); return (0); } static int vmxnet3_reinit_queues(struct vmxnet3_softc *sc) { device_t dev; int q, error; dev = sc->vmx_dev; for (q = 0; q < sc->vmx_ntxqueues; q++) vmxnet3_txinit(sc, &sc->vmx_txq[q]); for (q = 0; q < sc->vmx_nrxqueues; q++) { error = vmxnet3_rxinit(sc, &sc->vmx_rxq[q]); if (error) { device_printf(dev, "cannot populate Rx queue %d\n", q); return (error); } } return (0); } static int vmxnet3_enable_device(struct vmxnet3_softc *sc) { int q; if (vmxnet3_read_cmd(sc, VMXNET3_CMD_ENABLE) != 0) { device_printf(sc->vmx_dev, "device enable command failed!\n"); return (1); } /* Reset the Rx queue heads. */ for (q = 0; q < sc->vmx_nrxqueues; q++) { vmxnet3_write_bar0(sc, VMXNET3_BAR0_RXH1(q), 0); vmxnet3_write_bar0(sc, VMXNET3_BAR0_RXH2(q), 0); } return (0); } static void vmxnet3_reinit_rxfilters(struct vmxnet3_softc *sc) { struct ifnet *ifp; ifp = sc->vmx_ifp; vmxnet3_set_rxfilter(sc); if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) bcopy(sc->vmx_vlan_filter, sc->vmx_ds->vlan_filter, sizeof(sc->vmx_ds->vlan_filter)); else bzero(sc->vmx_ds->vlan_filter, sizeof(sc->vmx_ds->vlan_filter)); vmxnet3_write_cmd(sc, VMXNET3_CMD_VLAN_FILTER); } static int vmxnet3_reinit(struct vmxnet3_softc *sc) { vmxnet3_reinit_interface(sc); vmxnet3_reinit_shared_data(sc); if (vmxnet3_reinit_queues(sc) != 0) return (ENXIO); if (vmxnet3_enable_device(sc) != 0) return (ENXIO); vmxnet3_reinit_rxfilters(sc); return (0); } static void vmxnet3_init_locked(struct vmxnet3_softc *sc) { struct ifnet *ifp; ifp = sc->vmx_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; vmxnet3_stop(sc); if (vmxnet3_reinit(sc) != 0) { vmxnet3_stop(sc); return; } ifp->if_drv_flags |= IFF_DRV_RUNNING; vmxnet3_link_status(sc); vmxnet3_enable_all_intrs(sc); callout_reset(&sc->vmx_tick, hz, vmxnet3_tick, sc); } static void vmxnet3_init(void *xsc) { struct vmxnet3_softc *sc; sc = xsc; VMXNET3_CORE_LOCK(sc); vmxnet3_init_locked(sc); VMXNET3_CORE_UNLOCK(sc); } /* * BMV: Much of this can go away once we finally have offsets in * the mbuf packet header. Bug andre@. */ static int vmxnet3_txq_offload_ctx(struct vmxnet3_txqueue *txq, struct mbuf *m, int *etype, int *proto, int *start) { struct ether_vlan_header *evh; int offset; #if defined(INET) struct ip *ip = NULL; struct ip iphdr; #endif #if defined(INET6) struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6hdr; #endif evh = mtod(m, struct ether_vlan_header *); if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { /* BMV: We should handle nested VLAN tags too. */ *etype = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else { *etype = ntohs(evh->evl_encap_proto); offset = sizeof(struct ether_header); } switch (*etype) { #if defined(INET) case ETHERTYPE_IP: if (__predict_false(m->m_len < offset + sizeof(struct ip))) { m_copydata(m, offset, sizeof(struct ip), (caddr_t) &iphdr); ip = &iphdr; } else ip = mtodo(m, offset); *proto = ip->ip_p; *start = offset + (ip->ip_hl << 2); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(m->m_len < offset + sizeof(struct ip6_hdr))) { m_copydata(m, offset, sizeof(struct ip6_hdr), (caddr_t) &ip6hdr); ip6 = &ip6hdr; } else ip6 = mtodo(m, offset); *proto = -1; *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); /* Assert the network stack sent us a valid packet. */ KASSERT(*start > offset, ("%s: mbuf %p start %d offset %d proto %d", __func__, m, *start, offset, *proto)); break; #endif default: return (EINVAL); } if (m->m_pkthdr.csum_flags & CSUM_TSO) { struct tcphdr *tcp, tcphdr; uint16_t sum; if (__predict_false(*proto != IPPROTO_TCP)) { /* Likely failed to correctly parse the mbuf. */ return (EINVAL); } txq->vxtxq_stats.vmtxs_tso++; switch (*etype) { #if defined(INET) case ETHERTYPE_IP: sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); break; #endif default: sum = 0; break; } if (m->m_len < *start + sizeof(struct tcphdr)) { m_copyback(m, *start + offsetof(struct tcphdr, th_sum), sizeof(uint16_t), (caddr_t) &sum); m_copydata(m, *start, sizeof(struct tcphdr), (caddr_t) &tcphdr); tcp = &tcphdr; } else { tcp = mtodo(m, *start); tcp->th_sum = sum; } /* * For TSO, the size of the protocol header is also * included in the descriptor header size. */ *start += (tcp->th_off << 2); } else txq->vxtxq_stats.vmtxs_csum++; return (0); } static int vmxnet3_txq_load_mbuf(struct vmxnet3_txqueue *txq, struct mbuf **m0, bus_dmamap_t dmap, bus_dma_segment_t segs[], int *nsegs) { struct vmxnet3_txring *txr; struct mbuf *m; bus_dma_tag_t tag; int error; txr = &txq->vxtxq_cmd_ring; m = *m0; tag = txr->vxtxr_txtag; error = bus_dmamap_load_mbuf_sg(tag, dmap, m, segs, nsegs, 0); if (error == 0 || error != EFBIG) return (error); m = m_defrag(m, M_NOWAIT); if (m != NULL) { *m0 = m; error = bus_dmamap_load_mbuf_sg(tag, dmap, m, segs, nsegs, 0); } else error = ENOBUFS; if (error) { m_freem(*m0); *m0 = NULL; txq->vxtxq_sc->vmx_stats.vmst_defrag_failed++; } else txq->vxtxq_sc->vmx_stats.vmst_defragged++; return (error); } static void vmxnet3_txq_unload_mbuf(struct vmxnet3_txqueue *txq, bus_dmamap_t dmap) { struct vmxnet3_txring *txr; txr = &txq->vxtxq_cmd_ring; bus_dmamap_unload(txr->vxtxr_txtag, dmap); } static int vmxnet3_txq_encap(struct vmxnet3_txqueue *txq, struct mbuf **m0) { struct vmxnet3_softc *sc; struct vmxnet3_txring *txr; struct vmxnet3_txdesc *txd, *sop; struct mbuf *m; bus_dmamap_t dmap; bus_dma_segment_t segs[VMXNET3_TX_MAXSEGS]; int i, gen, nsegs, etype, proto, start, error; sc = txq->vxtxq_sc; start = 0; txd = NULL; txr = &txq->vxtxq_cmd_ring; dmap = txr->vxtxr_txbuf[txr->vxtxr_head].vtxb_dmamap; error = vmxnet3_txq_load_mbuf(txq, m0, dmap, segs, &nsegs); if (error) return (error); m = *m0; M_ASSERTPKTHDR(m); KASSERT(nsegs <= VMXNET3_TX_MAXSEGS, ("%s: mbuf %p with too many segments %d", __func__, m, nsegs)); if (VMXNET3_TXRING_AVAIL(txr) < nsegs) { txq->vxtxq_stats.vmtxs_full++; vmxnet3_txq_unload_mbuf(txq, dmap); return (ENOSPC); } else if (m->m_pkthdr.csum_flags & VMXNET3_CSUM_ALL_OFFLOAD) { error = vmxnet3_txq_offload_ctx(txq, m, &etype, &proto, &start); if (error) { txq->vxtxq_stats.vmtxs_offload_failed++; vmxnet3_txq_unload_mbuf(txq, dmap); m_freem(m); *m0 = NULL; return (error); } } txr->vxtxr_txbuf[txr->vxtxr_head].vtxb_m = m; sop = &txr->vxtxr_txd[txr->vxtxr_head]; gen = txr->vxtxr_gen ^ 1; /* Owned by cpu (yet) */ for (i = 0; i < nsegs; i++) { txd = &txr->vxtxr_txd[txr->vxtxr_head]; txd->addr = segs[i].ds_addr; txd->len = segs[i].ds_len; txd->gen = gen; txd->dtype = 0; txd->offload_mode = VMXNET3_OM_NONE; txd->offload_pos = 0; txd->hlen = 0; txd->eop = 0; txd->compreq = 0; txd->vtag_mode = 0; txd->vtag = 0; if (++txr->vxtxr_head == txr->vxtxr_ndesc) { txr->vxtxr_head = 0; txr->vxtxr_gen ^= 1; } gen = txr->vxtxr_gen; } txd->eop = 1; txd->compreq = 1; if (m->m_flags & M_VLANTAG) { sop->vtag_mode = 1; sop->vtag = m->m_pkthdr.ether_vtag; } if (m->m_pkthdr.csum_flags & CSUM_TSO) { sop->offload_mode = VMXNET3_OM_TSO; sop->hlen = start; sop->offload_pos = m->m_pkthdr.tso_segsz; } else if (m->m_pkthdr.csum_flags & (VMXNET3_CSUM_OFFLOAD | VMXNET3_CSUM_OFFLOAD_IPV6)) { sop->offload_mode = VMXNET3_OM_CSUM; sop->hlen = start; sop->offload_pos = start + m->m_pkthdr.csum_data; } /* Finally, change the ownership. */ vmxnet3_barrier(sc, VMXNET3_BARRIER_WR); sop->gen ^= 1; txq->vxtxq_ts->npending += nsegs; if (txq->vxtxq_ts->npending >= txq->vxtxq_ts->intr_threshold) { txq->vxtxq_ts->npending = 0; vmxnet3_write_bar0(sc, VMXNET3_BAR0_TXH(txq->vxtxq_id), txr->vxtxr_head); } return (0); } #ifdef VMXNET3_LEGACY_TX static void vmxnet3_start_locked(struct ifnet *ifp) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; struct vmxnet3_txring *txr; struct mbuf *m_head; int tx, avail; sc = ifp->if_softc; txq = &sc->vmx_txq[0]; txr = &txq->vxtxq_cmd_ring; tx = 0; VMXNET3_TXQ_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->vmx_link_active == 0) return; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if ((avail = VMXNET3_TXRING_AVAIL(txr)) < 2) break; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; /* Assume worse case if this mbuf is the head of a chain. */ if (m_head->m_next != NULL && avail < VMXNET3_TX_MAXSEGS) { IFQ_DRV_PREPEND(&ifp->if_snd, m_head); break; } if (vmxnet3_txq_encap(txq, &m_head) != 0) { if (m_head != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m_head); break; } tx++; ETHER_BPF_MTAP(ifp, m_head); } if (tx > 0) txq->vxtxq_watchdog = VMXNET3_WATCHDOG_TIMEOUT; } static void vmxnet3_start(struct ifnet *ifp) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; sc = ifp->if_softc; txq = &sc->vmx_txq[0]; VMXNET3_TXQ_LOCK(txq); vmxnet3_start_locked(ifp); VMXNET3_TXQ_UNLOCK(txq); } #else /* !VMXNET3_LEGACY_TX */ static int vmxnet3_txq_mq_start_locked(struct vmxnet3_txqueue *txq, struct mbuf *m) { struct vmxnet3_softc *sc; struct vmxnet3_txring *txr; struct buf_ring *br; struct ifnet *ifp; int tx, avail, error; sc = txq->vxtxq_sc; br = txq->vxtxq_br; ifp = sc->vmx_ifp; txr = &txq->vxtxq_cmd_ring; tx = 0; error = 0; VMXNET3_TXQ_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->vmx_link_active == 0) { if (m != NULL) error = drbr_enqueue(ifp, br, m); return (error); } if (m != NULL) { error = drbr_enqueue(ifp, br, m); if (error) return (error); } while ((avail = VMXNET3_TXRING_AVAIL(txr)) >= 2) { m = drbr_peek(ifp, br); if (m == NULL) break; /* Assume worse case if this mbuf is the head of a chain. */ if (m->m_next != NULL && avail < VMXNET3_TX_MAXSEGS) { drbr_putback(ifp, br, m); break; } if (vmxnet3_txq_encap(txq, &m) != 0) { if (m != NULL) drbr_putback(ifp, br, m); else drbr_advance(ifp, br); break; } drbr_advance(ifp, br); tx++; ETHER_BPF_MTAP(ifp, m); } if (tx > 0) txq->vxtxq_watchdog = VMXNET3_WATCHDOG_TIMEOUT; return (0); } static int vmxnet3_txq_mq_start(struct ifnet *ifp, struct mbuf *m) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; int i, ntxq, error; sc = ifp->if_softc; ntxq = sc->vmx_ntxqueues; - if (m->m_flags & M_FLOWID) + /* check if flowid is set */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % ntxq; else i = curcpu % ntxq; txq = &sc->vmx_txq[i]; if (VMXNET3_TXQ_TRYLOCK(txq) != 0) { error = vmxnet3_txq_mq_start_locked(txq, m); VMXNET3_TXQ_UNLOCK(txq); } else { error = drbr_enqueue(ifp, txq->vxtxq_br, m); taskqueue_enqueue(sc->vmx_tq, &txq->vxtxq_defrtask); } return (error); } static void vmxnet3_txq_tq_deferred(void *xtxq, int pending) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; txq = xtxq; sc = txq->vxtxq_sc; VMXNET3_TXQ_LOCK(txq); if (!drbr_empty(sc->vmx_ifp, txq->vxtxq_br)) vmxnet3_txq_mq_start_locked(txq, NULL); VMXNET3_TXQ_UNLOCK(txq); } #endif /* VMXNET3_LEGACY_TX */ static void vmxnet3_txq_start(struct vmxnet3_txqueue *txq) { struct vmxnet3_softc *sc; struct ifnet *ifp; sc = txq->vxtxq_sc; ifp = sc->vmx_ifp; #ifdef VMXNET3_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) vmxnet3_start_locked(ifp); #else if (!drbr_empty(ifp, txq->vxtxq_br)) vmxnet3_txq_mq_start_locked(txq, NULL); #endif } static void vmxnet3_tx_start_all(struct vmxnet3_softc *sc) { struct vmxnet3_txqueue *txq; int i; VMXNET3_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; VMXNET3_TXQ_LOCK(txq); vmxnet3_txq_start(txq); VMXNET3_TXQ_UNLOCK(txq); } } static void vmxnet3_update_vlan_filter(struct vmxnet3_softc *sc, int add, uint16_t tag) { struct ifnet *ifp; int idx, bit; ifp = sc->vmx_ifp; idx = (tag >> 5) & 0x7F; bit = tag & 0x1F; if (tag == 0 || tag > 4095) return; VMXNET3_CORE_LOCK(sc); /* Update our private VLAN bitvector. */ if (add) sc->vmx_vlan_filter[idx] |= (1 << bit); else sc->vmx_vlan_filter[idx] &= ~(1 << bit); if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { if (add) sc->vmx_ds->vlan_filter[idx] |= (1 << bit); else sc->vmx_ds->vlan_filter[idx] &= ~(1 << bit); vmxnet3_write_cmd(sc, VMXNET3_CMD_VLAN_FILTER); } VMXNET3_CORE_UNLOCK(sc); } static void vmxnet3_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag) { if (ifp->if_softc == arg) vmxnet3_update_vlan_filter(arg, 1, tag); } static void vmxnet3_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag) { if (ifp->if_softc == arg) vmxnet3_update_vlan_filter(arg, 0, tag); } static void vmxnet3_set_rxfilter(struct vmxnet3_softc *sc) { struct ifnet *ifp; struct vmxnet3_driver_shared *ds; struct ifmultiaddr *ifma; u_int mode; ifp = sc->vmx_ifp; ds = sc->vmx_ds; mode = VMXNET3_RXMODE_UCAST | VMXNET3_RXMODE_BCAST; if (ifp->if_flags & IFF_PROMISC) mode |= VMXNET3_RXMODE_PROMISC; if (ifp->if_flags & IFF_ALLMULTI) mode |= VMXNET3_RXMODE_ALLMULTI; else { int cnt = 0, overflow = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; else if (cnt == VMXNET3_MULTICAST_MAX) { overflow = 1; break; } bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &sc->vmx_mcast[cnt*ETHER_ADDR_LEN], ETHER_ADDR_LEN); cnt++; } if_maddr_runlock(ifp); if (overflow != 0) { cnt = 0; mode |= VMXNET3_RXMODE_ALLMULTI; } else if (cnt > 0) mode |= VMXNET3_RXMODE_MCAST; ds->mcast_tablelen = cnt * ETHER_ADDR_LEN; } ds->rxmode = mode; vmxnet3_write_cmd(sc, VMXNET3_CMD_SET_FILTER); vmxnet3_write_cmd(sc, VMXNET3_CMD_SET_RXMODE); } static int vmxnet3_change_mtu(struct vmxnet3_softc *sc, int mtu) { struct ifnet *ifp; ifp = sc->vmx_ifp; if (mtu < VMXNET3_MIN_MTU || mtu > VMXNET3_MAX_MTU) return (EINVAL); ifp->if_mtu = mtu; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vmxnet3_init_locked(sc); } return (0); } static int vmxnet3_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct vmxnet3_softc *sc; struct ifreq *ifr; int reinit, mask, error; sc = ifp->if_softc; ifr = (struct ifreq *) data; error = 0; switch (cmd) { case SIOCSIFMTU: if (ifp->if_mtu != ifr->ifr_mtu) { VMXNET3_CORE_LOCK(sc); error = vmxnet3_change_mtu(sc, ifr->ifr_mtu); VMXNET3_CORE_UNLOCK(sc); } break; case SIOCSIFFLAGS: VMXNET3_CORE_LOCK(sc); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ sc->vmx_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { vmxnet3_set_rxfilter(sc); } } else vmxnet3_init_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) vmxnet3_stop(sc); } sc->vmx_if_flags = ifp->if_flags; VMXNET3_CORE_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: VMXNET3_CORE_LOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) vmxnet3_set_rxfilter(sc); VMXNET3_CORE_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->vmx_media, cmd); break; case SIOCSIFCAP: VMXNET3_CORE_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) ifp->if_capenable ^= IFCAP_TXCSUM; if (mask & IFCAP_TXCSUM_IPV6) ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_TSO6) ifp->if_capenable ^= IFCAP_TSO6; if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWFILTER)) { /* Changing these features requires us to reinit. */ reinit = 1; if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWFILTER) ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; } else reinit = 0; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vmxnet3_init_locked(sc); } VMXNET3_CORE_UNLOCK(sc); VLAN_CAPABILITIES(ifp); break; default: error = ether_ioctl(ifp, cmd, data); break; } VMXNET3_CORE_LOCK_ASSERT_NOTOWNED(sc); return (error); } #ifndef VMXNET3_LEGACY_TX static void vmxnet3_qflush(struct ifnet *ifp) { struct vmxnet3_softc *sc; struct vmxnet3_txqueue *txq; struct mbuf *m; int i; sc = ifp->if_softc; for (i = 0; i < sc->vmx_ntxqueues; i++) { txq = &sc->vmx_txq[i]; VMXNET3_TXQ_LOCK(txq); while ((m = buf_ring_dequeue_sc(txq->vxtxq_br)) != NULL) m_freem(m); VMXNET3_TXQ_UNLOCK(txq); } if_qflush(ifp); } #endif static int vmxnet3_watchdog(struct vmxnet3_txqueue *txq) { struct vmxnet3_softc *sc; sc = txq->vxtxq_sc; VMXNET3_TXQ_LOCK(txq); if (txq->vxtxq_watchdog == 0 || --txq->vxtxq_watchdog) { VMXNET3_TXQ_UNLOCK(txq); return (0); } VMXNET3_TXQ_UNLOCK(txq); if_printf(sc->vmx_ifp, "watchdog timeout on queue %d\n", txq->vxtxq_id); return (1); } static void vmxnet3_refresh_host_stats(struct vmxnet3_softc *sc) { vmxnet3_write_cmd(sc, VMXNET3_CMD_GET_STATS); } static uint64_t vmxnet3_get_counter(struct ifnet *ifp, ift_counter cnt) { struct vmxnet3_softc *sc; uint64_t rv; sc = if_getsoftc(ifp); rv = 0; /* * With the exception of if_ierrors, these ifnet statistics are * only updated in the driver, so just set them to our accumulated * values. if_ierrors is updated in ether_input() for malformed * frames that we should have already discarded. */ switch (cnt) { case IFCOUNTER_IPACKETS: for (int i = 0; i < sc->vmx_nrxqueues; i++) rv += sc->vmx_rxq[i].vxrxq_stats.vmrxs_ipackets; return (rv); case IFCOUNTER_IQDROPS: for (int i = 0; i < sc->vmx_nrxqueues; i++) rv += sc->vmx_rxq[i].vxrxq_stats.vmrxs_iqdrops; return (rv); case IFCOUNTER_IERRORS: for (int i = 0; i < sc->vmx_nrxqueues; i++) rv += sc->vmx_rxq[i].vxrxq_stats.vmrxs_ierrors; return (rv); case IFCOUNTER_OPACKETS: for (int i = 0; i < sc->vmx_ntxqueues; i++) rv += sc->vmx_txq[i].vxtxq_stats.vmtxs_opackets; return (rv); #ifndef VMXNET3_LEGACY_TX case IFCOUNTER_OBYTES: for (int i = 0; i < sc->vmx_ntxqueues; i++) rv += sc->vmx_txq[i].vxtxq_stats.vmtxs_obytes; return (rv); case IFCOUNTER_OMCASTS: for (int i = 0; i < sc->vmx_ntxqueues; i++) rv += sc->vmx_txq[i].vxtxq_stats.vmtxs_omcasts; return (rv); #endif default: return (if_get_counter_default(ifp, cnt)); } } static void vmxnet3_tick(void *xsc) { struct vmxnet3_softc *sc; struct ifnet *ifp; int i, timedout; sc = xsc; ifp = sc->vmx_ifp; timedout = 0; VMXNET3_CORE_LOCK_ASSERT(sc); vmxnet3_refresh_host_stats(sc); for (i = 0; i < sc->vmx_ntxqueues; i++) timedout |= vmxnet3_watchdog(&sc->vmx_txq[i]); if (timedout != 0) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; vmxnet3_init_locked(sc); } else callout_reset(&sc->vmx_tick, hz, vmxnet3_tick, sc); } static int vmxnet3_link_is_up(struct vmxnet3_softc *sc) { uint32_t status; /* Also update the link speed while here. */ status = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_LINK); sc->vmx_link_speed = status >> 16; return !!(status & 0x1); } static void vmxnet3_link_status(struct vmxnet3_softc *sc) { struct ifnet *ifp; int link; ifp = sc->vmx_ifp; link = vmxnet3_link_is_up(sc); if (link != 0 && sc->vmx_link_active == 0) { sc->vmx_link_active = 1; if_link_state_change(ifp, LINK_STATE_UP); } else if (link == 0 && sc->vmx_link_active != 0) { sc->vmx_link_active = 0; if_link_state_change(ifp, LINK_STATE_DOWN); } } static void vmxnet3_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct vmxnet3_softc *sc; sc = ifp->if_softc; ifmr->ifm_active = IFM_ETHER | IFM_AUTO; ifmr->ifm_status = IFM_AVALID; VMXNET3_CORE_LOCK(sc); if (vmxnet3_link_is_up(sc) != 0) ifmr->ifm_status |= IFM_ACTIVE; else ifmr->ifm_status |= IFM_NONE; VMXNET3_CORE_UNLOCK(sc); } static int vmxnet3_media_change(struct ifnet *ifp) { /* Ignore. */ return (0); } static void vmxnet3_set_lladdr(struct vmxnet3_softc *sc) { uint32_t ml, mh; ml = sc->vmx_lladdr[0]; ml |= sc->vmx_lladdr[1] << 8; ml |= sc->vmx_lladdr[2] << 16; ml |= sc->vmx_lladdr[3] << 24; vmxnet3_write_bar1(sc, VMXNET3_BAR1_MACL, ml); mh = sc->vmx_lladdr[4]; mh |= sc->vmx_lladdr[5] << 8; vmxnet3_write_bar1(sc, VMXNET3_BAR1_MACH, mh); } static void vmxnet3_get_lladdr(struct vmxnet3_softc *sc) { uint32_t ml, mh; ml = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_MACL); mh = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_MACH); sc->vmx_lladdr[0] = ml; sc->vmx_lladdr[1] = ml >> 8; sc->vmx_lladdr[2] = ml >> 16; sc->vmx_lladdr[3] = ml >> 24; sc->vmx_lladdr[4] = mh; sc->vmx_lladdr[5] = mh >> 8; } static void vmxnet3_setup_txq_sysctl(struct vmxnet3_txqueue *txq, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child) { struct sysctl_oid *node, *txsnode; struct sysctl_oid_list *list, *txslist; struct vmxnet3_txq_stats *stats; struct UPT1_TxStats *txstats; char namebuf[16]; stats = &txq->vxtxq_stats; txstats = &txq->vxtxq_ts->stats; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vxtxq_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Transmit Queue"); txq->vxtxq_sysctl = list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD, &stats->vmtxs_opackets, "Transmit packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD, &stats->vmtxs_obytes, "Transmit bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD, &stats->vmtxs_omcasts, "Transmit multicasts"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vmtxs_csum, "Transmit checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD, &stats->vmtxs_tso, "Transmit TCP segmentation offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ringfull", CTLFLAG_RD, &stats->vmtxs_full, "Transmit ring full"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "offload_failed", CTLFLAG_RD, &stats->vmtxs_offload_failed, "Transmit checksum offload failed"); /* * Add statistics reported by the host. These are updated once * per second. */ txsnode = SYSCTL_ADD_NODE(ctx, list, OID_AUTO, "hstats", CTLFLAG_RD, NULL, "Host Statistics"); txslist = SYSCTL_CHILDREN(txsnode); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "tso_packets", CTLFLAG_RD, &txstats->TSO_packets, "TSO packets"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "tso_bytes", CTLFLAG_RD, &txstats->TSO_bytes, "TSO bytes"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "ucast_packets", CTLFLAG_RD, &txstats->ucast_packets, "Unicast packets"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "unicast_bytes", CTLFLAG_RD, &txstats->ucast_bytes, "Unicast bytes"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "mcast_packets", CTLFLAG_RD, &txstats->mcast_packets, "Multicast packets"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "mcast_bytes", CTLFLAG_RD, &txstats->mcast_bytes, "Multicast bytes"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "error", CTLFLAG_RD, &txstats->error, "Errors"); SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "discard", CTLFLAG_RD, &txstats->discard, "Discards"); } static void vmxnet3_setup_rxq_sysctl(struct vmxnet3_rxqueue *rxq, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child) { struct sysctl_oid *node, *rxsnode; struct sysctl_oid_list *list, *rxslist; struct vmxnet3_rxq_stats *stats; struct UPT1_RxStats *rxstats; char namebuf[16]; stats = &rxq->vxrxq_stats; rxstats = &rxq->vxrxq_rs->stats; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vxrxq_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Receive Queue"); rxq->vxrxq_sysctl = list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD, &stats->vmrxs_ipackets, "Receive packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD, &stats->vmrxs_ibytes, "Receive bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD, &stats->vmrxs_iqdrops, "Receive drops"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD, &stats->vmrxs_ierrors, "Receive errors"); /* * Add statistics reported by the host. These are updated once * per second. */ rxsnode = SYSCTL_ADD_NODE(ctx, list, OID_AUTO, "hstats", CTLFLAG_RD, NULL, "Host Statistics"); rxslist = SYSCTL_CHILDREN(rxsnode); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "lro_packets", CTLFLAG_RD, &rxstats->LRO_packets, "LRO packets"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "lro_bytes", CTLFLAG_RD, &rxstats->LRO_bytes, "LRO bytes"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "ucast_packets", CTLFLAG_RD, &rxstats->ucast_packets, "Unicast packets"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "unicast_bytes", CTLFLAG_RD, &rxstats->ucast_bytes, "Unicast bytes"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "mcast_packets", CTLFLAG_RD, &rxstats->mcast_packets, "Multicast packets"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "mcast_bytes", CTLFLAG_RD, &rxstats->mcast_bytes, "Multicast bytes"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "bcast_packets", CTLFLAG_RD, &rxstats->bcast_packets, "Broadcast packets"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "bcast_bytes", CTLFLAG_RD, &rxstats->bcast_bytes, "Broadcast bytes"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "nobuffer", CTLFLAG_RD, &rxstats->nobuffer, "No buffer"); SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "error", CTLFLAG_RD, &rxstats->error, "Errors"); } static void vmxnet3_setup_debug_sysctl(struct vmxnet3_softc *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child) { struct sysctl_oid *node; struct sysctl_oid_list *list; int i; for (i = 0; i < sc->vmx_ntxqueues; i++) { struct vmxnet3_txqueue *txq = &sc->vmx_txq[i]; node = SYSCTL_ADD_NODE(ctx, txq->vxtxq_sysctl, OID_AUTO, "debug", CTLFLAG_RD, NULL, ""); list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd_head", CTLFLAG_RD, &txq->vxtxq_cmd_ring.vxtxr_head, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd_next", CTLFLAG_RD, &txq->vxtxq_cmd_ring.vxtxr_next, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd_ndesc", CTLFLAG_RD, &txq->vxtxq_cmd_ring.vxtxr_ndesc, 0, ""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd_gen", CTLFLAG_RD, &txq->vxtxq_cmd_ring.vxtxr_gen, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_next", CTLFLAG_RD, &txq->vxtxq_comp_ring.vxcr_next, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_ndesc", CTLFLAG_RD, &txq->vxtxq_comp_ring.vxcr_ndesc, 0,""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "comp_gen", CTLFLAG_RD, &txq->vxtxq_comp_ring.vxcr_gen, 0, ""); } for (i = 0; i < sc->vmx_nrxqueues; i++) { struct vmxnet3_rxqueue *rxq = &sc->vmx_rxq[i]; node = SYSCTL_ADD_NODE(ctx, rxq->vxrxq_sysctl, OID_AUTO, "debug", CTLFLAG_RD, NULL, ""); list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd0_fill", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[0].vxrxr_fill, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd0_ndesc", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[0].vxrxr_ndesc, 0, ""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd0_gen", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[0].vxrxr_gen, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd1_fill", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[1].vxrxr_fill, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd1_ndesc", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[1].vxrxr_ndesc, 0, ""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd1_gen", CTLFLAG_RD, &rxq->vxrxq_cmd_ring[1].vxrxr_gen, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_next", CTLFLAG_RD, &rxq->vxrxq_comp_ring.vxcr_next, 0, ""); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_ndesc", CTLFLAG_RD, &rxq->vxrxq_comp_ring.vxcr_ndesc, 0,""); SYSCTL_ADD_INT(ctx, list, OID_AUTO, "comp_gen", CTLFLAG_RD, &rxq->vxrxq_comp_ring.vxcr_gen, 0, ""); } } static void vmxnet3_setup_queue_sysctl(struct vmxnet3_softc *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child) { int i; for (i = 0; i < sc->vmx_ntxqueues; i++) vmxnet3_setup_txq_sysctl(&sc->vmx_txq[i], ctx, child); for (i = 0; i < sc->vmx_nrxqueues; i++) vmxnet3_setup_rxq_sysctl(&sc->vmx_rxq[i], ctx, child); vmxnet3_setup_debug_sysctl(sc, ctx, child); } static void vmxnet3_setup_sysctl(struct vmxnet3_softc *sc) { device_t dev; struct vmxnet3_statistics *stats; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vmx_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_ntxqueues", CTLFLAG_RD, &sc->vmx_max_ntxqueues, 0, "Maximum number of Tx queues"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_nrxqueues", CTLFLAG_RD, &sc->vmx_max_nrxqueues, 0, "Maximum number of Rx queues"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "ntxqueues", CTLFLAG_RD, &sc->vmx_ntxqueues, 0, "Number of Tx queues"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "nrxqueues", CTLFLAG_RD, &sc->vmx_nrxqueues, 0, "Number of Rx queues"); stats = &sc->vmx_stats; SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "defragged", CTLFLAG_RD, &stats->vmst_defragged, 0, "Tx mbuf chains defragged"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "defrag_failed", CTLFLAG_RD, &stats->vmst_defrag_failed, 0, "Tx mbuf dropped because defrag failed"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "mgetcl_failed", CTLFLAG_RD, &stats->vmst_mgetcl_failed, 0, "mbuf cluster allocation failed"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "mbuf_load_failed", CTLFLAG_RD, &stats->vmst_mbuf_load_failed, 0, "mbuf load segments failed"); vmxnet3_setup_queue_sysctl(sc, ctx, child); } static void vmxnet3_write_bar0(struct vmxnet3_softc *sc, bus_size_t r, uint32_t v) { bus_space_write_4(sc->vmx_iot0, sc->vmx_ioh0, r, v); } static uint32_t vmxnet3_read_bar1(struct vmxnet3_softc *sc, bus_size_t r) { return (bus_space_read_4(sc->vmx_iot1, sc->vmx_ioh1, r)); } static void vmxnet3_write_bar1(struct vmxnet3_softc *sc, bus_size_t r, uint32_t v) { bus_space_write_4(sc->vmx_iot1, sc->vmx_ioh1, r, v); } static void vmxnet3_write_cmd(struct vmxnet3_softc *sc, uint32_t cmd) { vmxnet3_write_bar1(sc, VMXNET3_BAR1_CMD, cmd); } static uint32_t vmxnet3_read_cmd(struct vmxnet3_softc *sc, uint32_t cmd) { vmxnet3_write_cmd(sc, cmd); bus_space_barrier(sc->vmx_iot1, sc->vmx_ioh1, 0, 0, BUS_SPACE_BARRIER_READ | BUS_SPACE_BARRIER_WRITE); return (vmxnet3_read_bar1(sc, VMXNET3_BAR1_CMD)); } static void vmxnet3_enable_intr(struct vmxnet3_softc *sc, int irq) { vmxnet3_write_bar0(sc, VMXNET3_BAR0_IMASK(irq), 0); } static void vmxnet3_disable_intr(struct vmxnet3_softc *sc, int irq) { vmxnet3_write_bar0(sc, VMXNET3_BAR0_IMASK(irq), 1); } static void vmxnet3_enable_all_intrs(struct vmxnet3_softc *sc) { int i; sc->vmx_ds->ictrl &= ~VMXNET3_ICTRL_DISABLE_ALL; for (i = 0; i < sc->vmx_nintrs; i++) vmxnet3_enable_intr(sc, i); } static void vmxnet3_disable_all_intrs(struct vmxnet3_softc *sc) { int i; sc->vmx_ds->ictrl |= VMXNET3_ICTRL_DISABLE_ALL; for (i = 0; i < sc->vmx_nintrs; i++) vmxnet3_disable_intr(sc, i); } static void vmxnet3_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *baddr = arg; if (error == 0) *baddr = segs->ds_addr; } static int vmxnet3_dma_malloc(struct vmxnet3_softc *sc, bus_size_t size, bus_size_t align, struct vmxnet3_dma_alloc *dma) { device_t dev; int error; dev = sc->vmx_dev; bzero(dma, sizeof(struct vmxnet3_dma_alloc)); error = bus_dma_tag_create(bus_get_dma_tag(dev), align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &dma->dma_tag); if (error) { device_printf(dev, "bus_dma_tag_create failed: %d\n", error); goto fail; } error = bus_dmamem_alloc(dma->dma_tag, (void **)&dma->dma_vaddr, BUS_DMA_ZERO | BUS_DMA_NOWAIT, &dma->dma_map); if (error) { device_printf(dev, "bus_dmamem_alloc failed: %d\n", error); goto fail; } error = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, vmxnet3_dmamap_cb, &dma->dma_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "bus_dmamap_load failed: %d\n", error); goto fail; } dma->dma_size = size; fail: if (error) vmxnet3_dma_free(sc, dma); return (error); } static void vmxnet3_dma_free(struct vmxnet3_softc *sc, struct vmxnet3_dma_alloc *dma) { if (dma->dma_tag != NULL) { if (dma->dma_paddr != 0) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); } if (dma->dma_vaddr != NULL) { bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); } bus_dma_tag_destroy(dma->dma_tag); } bzero(dma, sizeof(struct vmxnet3_dma_alloc)); } static int vmxnet3_tunable_int(struct vmxnet3_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "hw.vmx.%d.%s", device_get_unit(sc->vmx_dev), knob); TUNABLE_INT_FETCH(path, &def); return (def); } /* * Since this is a purely paravirtualized device, we do not have * to worry about DMA coherency. But at times, we must make sure * both the compiler and CPU do not reorder memory operations. */ static inline void vmxnet3_barrier(struct vmxnet3_softc *sc, vmxnet3_barrier_t type) { switch (type) { case VMXNET3_BARRIER_RD: rmb(); break; case VMXNET3_BARRIER_WR: wmb(); break; case VMXNET3_BARRIER_RDWR: mb(); break; default: panic("%s: bad barrier type %d", __func__, type); } } Index: head/sys/dev/vxge/vxge.c =================================================================== --- head/sys/dev/vxge/vxge.c (revision 275357) +++ head/sys/dev/vxge/vxge.c (revision 275358) @@ -1,4201 +1,4201 @@ /*- * Copyright(c) 2002-2011 Exar Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification are permitted provided the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. Neither the name of the Exar Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*$FreeBSD$*/ #include static int vxge_pci_bd_no = -1; static u32 vxge_drv_copyright = 0; static u32 vxge_dev_ref_count = 0; static u32 vxge_dev_req_reboot = 0; static int vpath_selector[VXGE_HAL_MAX_VIRTUAL_PATHS] = \ {0, 1, 3, 3, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, 31}; /* * vxge_probe * Probes for x3100 devices */ int vxge_probe(device_t ndev) { int err = ENXIO; u16 pci_bd_no = 0; u16 pci_vendor_id = 0; u16 pci_device_id = 0; char adapter_name[64]; pci_vendor_id = pci_get_vendor(ndev); if (pci_vendor_id != VXGE_PCI_VENDOR_ID) goto _exit0; pci_device_id = pci_get_device(ndev); if (pci_device_id == VXGE_PCI_DEVICE_ID_TITAN_1) { pci_bd_no = (pci_get_bus(ndev) | pci_get_slot(ndev)); snprintf(adapter_name, sizeof(adapter_name), VXGE_ADAPTER_NAME, pci_get_revid(ndev)); device_set_desc_copy(ndev, adapter_name); if (!vxge_drv_copyright) { device_printf(ndev, VXGE_COPYRIGHT); vxge_drv_copyright = 1; } if (vxge_dev_req_reboot == 0) { vxge_pci_bd_no = pci_bd_no; err = BUS_PROBE_DEFAULT; } else { if (pci_bd_no != vxge_pci_bd_no) { vxge_pci_bd_no = pci_bd_no; err = BUS_PROBE_DEFAULT; } } } _exit0: return (err); } /* * vxge_attach * Connects driver to the system if probe was success @ndev handle */ int vxge_attach(device_t ndev) { int err = 0; vxge_dev_t *vdev; vxge_hal_device_t *hldev = NULL; vxge_hal_device_attr_t device_attr; vxge_free_resources_e error_level = VXGE_FREE_NONE; vxge_hal_status_e status = VXGE_HAL_OK; /* Get per-ndev buffer */ vdev = (vxge_dev_t *) device_get_softc(ndev); if (!vdev) goto _exit0; bzero(vdev, sizeof(vxge_dev_t)); vdev->ndev = ndev; strlcpy(vdev->ndev_name, "vxge", sizeof(vdev->ndev_name)); err = vxge_driver_config(vdev); if (err != 0) goto _exit0; /* Initialize HAL driver */ status = vxge_driver_init(vdev); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "Failed to initialize driver\n"); goto _exit0; } /* Enable PCI bus-master */ pci_enable_busmaster(ndev); /* Allocate resources */ err = vxge_alloc_resources(vdev); if (err != 0) { device_printf(vdev->ndev, "resource allocation failed\n"); goto _exit0; } err = vxge_device_hw_info_get(vdev); if (err != 0) { error_level = VXGE_FREE_BAR2; goto _exit0; } /* Get firmware default values for Device Configuration */ vxge_hal_device_config_default_get(vdev->device_config); /* Customize Device Configuration based on User request */ vxge_vpath_config(vdev); /* Allocate ISR resources */ err = vxge_alloc_isr_resources(vdev); if (err != 0) { error_level = VXGE_FREE_ISR_RESOURCE; device_printf(vdev->ndev, "isr resource allocation failed\n"); goto _exit0; } /* HAL attributes */ device_attr.bar0 = (u8 *) vdev->pdev->bar_info[0]; device_attr.bar1 = (u8 *) vdev->pdev->bar_info[1]; device_attr.bar2 = (u8 *) vdev->pdev->bar_info[2]; device_attr.regh0 = (vxge_bus_res_t *) vdev->pdev->reg_map[0]; device_attr.regh1 = (vxge_bus_res_t *) vdev->pdev->reg_map[1]; device_attr.regh2 = (vxge_bus_res_t *) vdev->pdev->reg_map[2]; device_attr.irqh = (pci_irq_h) vdev->config.isr_info[0].irq_handle; device_attr.cfgh = vdev->pdev; device_attr.pdev = vdev->pdev; /* Initialize HAL Device */ status = vxge_hal_device_initialize((vxge_hal_device_h *) &hldev, &device_attr, vdev->device_config); if (status != VXGE_HAL_OK) { error_level = VXGE_FREE_ISR_RESOURCE; device_printf(vdev->ndev, "hal device initialization failed\n"); goto _exit0; } vdev->devh = hldev; vxge_hal_device_private_set(hldev, vdev); if (vdev->is_privilaged) { err = vxge_firmware_verify(vdev); if (err != 0) { vxge_dev_req_reboot = 1; error_level = VXGE_FREE_TERMINATE_DEVICE; goto _exit0; } } /* Allocate memory for vpath */ vdev->vpaths = (vxge_vpath_t *) vxge_mem_alloc(vdev->no_of_vpath * sizeof(vxge_vpath_t)); if (vdev->vpaths == NULL) { error_level = VXGE_FREE_TERMINATE_DEVICE; device_printf(vdev->ndev, "vpath memory allocation failed\n"); goto _exit0; } vdev->no_of_func = 1; if (vdev->is_privilaged) { vxge_hal_func_mode_count(vdev->devh, vdev->config.hw_info.function_mode, &vdev->no_of_func); vxge_bw_priority_config(vdev); } /* Initialize mutexes */ vxge_mutex_init(vdev); /* Initialize Media */ vxge_media_init(vdev); err = vxge_ifp_setup(ndev); if (err != 0) { error_level = VXGE_FREE_MEDIA; device_printf(vdev->ndev, "setting up interface failed\n"); goto _exit0; } err = vxge_isr_setup(vdev); if (err != 0) { error_level = VXGE_FREE_INTERFACE; device_printf(vdev->ndev, "failed to associate interrupt handler with device\n"); goto _exit0; } vxge_device_hw_info_print(vdev); vdev->is_active = TRUE; _exit0: if (error_level) { vxge_free_resources(ndev, error_level); err = ENXIO; } return (err); } /* * vxge_detach * Detaches driver from the Kernel subsystem */ int vxge_detach(device_t ndev) { vxge_dev_t *vdev; vdev = (vxge_dev_t *) device_get_softc(ndev); if (vdev->is_active) { vdev->is_active = FALSE; vxge_stop(vdev); vxge_free_resources(ndev, VXGE_FREE_ALL); } return (0); } /* * vxge_shutdown * To shutdown device before system shutdown */ int vxge_shutdown(device_t ndev) { vxge_dev_t *vdev = (vxge_dev_t *) device_get_softc(ndev); vxge_stop(vdev); return (0); } /* * vxge_init * Initialize the interface */ void vxge_init(void *vdev_ptr) { vxge_dev_t *vdev = (vxge_dev_t *) vdev_ptr; VXGE_DRV_LOCK(vdev); vxge_init_locked(vdev); VXGE_DRV_UNLOCK(vdev); } /* * vxge_init_locked * Initialize the interface */ void vxge_init_locked(vxge_dev_t *vdev) { int i, err = EINVAL; vxge_hal_device_t *hldev = vdev->devh; vxge_hal_status_e status = VXGE_HAL_OK; vxge_hal_vpath_h vpath_handle; ifnet_t ifp = vdev->ifp; /* If device is in running state, initializing is not required */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) goto _exit0; VXGE_DRV_LOCK_ASSERT(vdev); /* Opening vpaths */ err = vxge_vpath_open(vdev); if (err != 0) goto _exit1; if (vdev->config.rth_enable) { status = vxge_rth_config(vdev); if (status != VXGE_HAL_OK) goto _exit1; } for (i = 0; i < vdev->no_of_vpath; i++) { vpath_handle = vxge_vpath_handle_get(vdev, i); if (!vpath_handle) continue; /* check initial mtu before enabling the device */ status = vxge_hal_device_mtu_check(vpath_handle, ifp->if_mtu); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "invalid mtu size %u specified\n", ifp->if_mtu); goto _exit1; } status = vxge_hal_vpath_mtu_set(vpath_handle, ifp->if_mtu); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "setting mtu in device failed\n"); goto _exit1; } } /* Enable HAL device */ status = vxge_hal_device_enable(hldev); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "failed to enable device\n"); goto _exit1; } if (vdev->config.intr_mode == VXGE_HAL_INTR_MODE_MSIX) vxge_msix_enable(vdev); /* Checksum capability */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_TSO; for (i = 0; i < vdev->no_of_vpath; i++) { vpath_handle = vxge_vpath_handle_get(vdev, i); if (!vpath_handle) continue; /* Enabling mcast for all vpath */ vxge_hal_vpath_mcast_enable(vpath_handle); /* Enabling bcast for all vpath */ status = vxge_hal_vpath_bcast_enable(vpath_handle); if (status != VXGE_HAL_OK) device_printf(vdev->ndev, "can't enable bcast on vpath (%d)\n", i); } /* Enable interrupts */ vxge_hal_device_intr_enable(vdev->devh); for (i = 0; i < vdev->no_of_vpath; i++) { vpath_handle = vxge_vpath_handle_get(vdev, i); if (!vpath_handle) continue; bzero(&(vdev->vpaths[i].driver_stats), sizeof(vxge_drv_stats_t)); status = vxge_hal_vpath_enable(vpath_handle); if (status != VXGE_HAL_OK) goto _exit2; } vxge_os_mdelay(1000); /* Device is initialized */ vdev->is_initialized = TRUE; /* Now inform the stack we're ready */ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_RUNNING; goto _exit0; _exit2: vxge_hal_device_intr_disable(vdev->devh); vxge_hal_device_disable(hldev); _exit1: vxge_vpath_close(vdev); _exit0: return; } /* * vxge_driver_init * Initializes HAL driver */ vxge_hal_status_e vxge_driver_init(vxge_dev_t *vdev) { vxge_hal_uld_cbs_t uld_callbacks; vxge_hal_driver_config_t driver_config; vxge_hal_status_e status = VXGE_HAL_OK; /* Initialize HAL driver */ if (!vxge_dev_ref_count) { bzero(&uld_callbacks, sizeof(vxge_hal_uld_cbs_t)); bzero(&driver_config, sizeof(vxge_hal_driver_config_t)); uld_callbacks.link_up = vxge_link_up; uld_callbacks.link_down = vxge_link_down; uld_callbacks.crit_err = vxge_crit_error; uld_callbacks.sched_timer = NULL; uld_callbacks.xpak_alarm_log = NULL; status = vxge_hal_driver_initialize(&driver_config, &uld_callbacks); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "failed to initialize driver\n"); goto _exit0; } } vxge_hal_driver_debug_set(VXGE_TRACE); vxge_dev_ref_count++; _exit0: return (status); } /* * vxge_driver_config */ int vxge_driver_config(vxge_dev_t *vdev) { int i, err = 0; char temp_buffer[30]; vxge_bw_info_t bw_info; VXGE_GET_PARAM("hint.vxge.0.no_of_vpath", vdev->config, no_of_vpath, VXGE_DEFAULT_USER_HARDCODED); if (vdev->config.no_of_vpath == VXGE_DEFAULT_USER_HARDCODED) vdev->config.no_of_vpath = mp_ncpus; if (vdev->config.no_of_vpath <= 0) { err = EINVAL; device_printf(vdev->ndev, "Failed to load driver, \ invalid config : \'no_of_vpath\'\n"); goto _exit0; } VXGE_GET_PARAM("hint.vxge.0.intr_coalesce", vdev->config, intr_coalesce, VXGE_DEFAULT_CONFIG_DISABLE); VXGE_GET_PARAM("hint.vxge.0.rth_enable", vdev->config, rth_enable, VXGE_DEFAULT_CONFIG_ENABLE); VXGE_GET_PARAM("hint.vxge.0.rth_bkt_sz", vdev->config, rth_bkt_sz, VXGE_DEFAULT_RTH_BUCKET_SIZE); VXGE_GET_PARAM("hint.vxge.0.lro_enable", vdev->config, lro_enable, VXGE_DEFAULT_CONFIG_ENABLE); VXGE_GET_PARAM("hint.vxge.0.tso_enable", vdev->config, tso_enable, VXGE_DEFAULT_CONFIG_ENABLE); VXGE_GET_PARAM("hint.vxge.0.tx_steering", vdev->config, tx_steering, VXGE_DEFAULT_CONFIG_DISABLE); VXGE_GET_PARAM("hint.vxge.0.msix_enable", vdev->config, intr_mode, VXGE_HAL_INTR_MODE_MSIX); VXGE_GET_PARAM("hint.vxge.0.ifqmaxlen", vdev->config, ifq_maxlen, VXGE_DEFAULT_CONFIG_IFQ_MAXLEN); VXGE_GET_PARAM("hint.vxge.0.port_mode", vdev->config, port_mode, VXGE_DEFAULT_CONFIG_VALUE); if (vdev->config.port_mode == VXGE_DEFAULT_USER_HARDCODED) vdev->config.port_mode = VXGE_DEFAULT_CONFIG_VALUE; VXGE_GET_PARAM("hint.vxge.0.l2_switch", vdev->config, l2_switch, VXGE_DEFAULT_CONFIG_VALUE); if (vdev->config.l2_switch == VXGE_DEFAULT_USER_HARDCODED) vdev->config.l2_switch = VXGE_DEFAULT_CONFIG_VALUE; VXGE_GET_PARAM("hint.vxge.0.fw_upgrade", vdev->config, fw_option, VXGE_FW_UPGRADE_ALL); VXGE_GET_PARAM("hint.vxge.0.low_latency", vdev->config, low_latency, VXGE_DEFAULT_CONFIG_DISABLE); VXGE_GET_PARAM("hint.vxge.0.func_mode", vdev->config, function_mode, VXGE_DEFAULT_CONFIG_VALUE); if (vdev->config.function_mode == VXGE_DEFAULT_USER_HARDCODED) vdev->config.function_mode = VXGE_DEFAULT_CONFIG_VALUE; if (!(is_multi_func(vdev->config.function_mode) || is_single_func(vdev->config.function_mode))) vdev->config.function_mode = VXGE_DEFAULT_CONFIG_VALUE; for (i = 0; i < VXGE_HAL_MAX_FUNCTIONS; i++) { bw_info.func_id = i; sprintf(temp_buffer, "hint.vxge.0.bandwidth_%d", i); VXGE_GET_PARAM(temp_buffer, bw_info, bandwidth, VXGE_DEFAULT_USER_HARDCODED); if (bw_info.bandwidth == VXGE_DEFAULT_USER_HARDCODED) bw_info.bandwidth = VXGE_HAL_VPATH_BW_LIMIT_DEFAULT; sprintf(temp_buffer, "hint.vxge.0.priority_%d", i); VXGE_GET_PARAM(temp_buffer, bw_info, priority, VXGE_DEFAULT_USER_HARDCODED); if (bw_info.priority == VXGE_DEFAULT_USER_HARDCODED) bw_info.priority = VXGE_HAL_VPATH_PRIORITY_DEFAULT; vxge_os_memcpy(&vdev->config.bw_info[i], &bw_info, sizeof(vxge_bw_info_t)); } _exit0: return (err); } /* * vxge_stop */ void vxge_stop(vxge_dev_t *vdev) { VXGE_DRV_LOCK(vdev); vxge_stop_locked(vdev); VXGE_DRV_UNLOCK(vdev); } /* * vxge_stop_locked * Common code for both stop and part of reset. * disables device, interrupts and closes vpaths handle */ void vxge_stop_locked(vxge_dev_t *vdev) { u64 adapter_status = 0; vxge_hal_status_e status; vxge_hal_device_t *hldev = vdev->devh; ifnet_t ifp = vdev->ifp; VXGE_DRV_LOCK_ASSERT(vdev); /* If device is not in "Running" state, return */ if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return; /* Set appropriate flags */ vdev->is_initialized = FALSE; hldev->link_state = VXGE_HAL_LINK_NONE; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); /* Disable interrupts */ vxge_hal_device_intr_disable(hldev); /* Disable HAL device */ status = vxge_hal_device_disable(hldev); if (status != VXGE_HAL_OK) { vxge_hal_device_status(hldev, &adapter_status); device_printf(vdev->ndev, "adapter status: 0x%llx\n", adapter_status); } /* reset vpaths */ vxge_vpath_reset(vdev); vxge_os_mdelay(1000); /* Close Vpaths */ vxge_vpath_close(vdev); } void vxge_send(ifnet_t ifp) { vxge_vpath_t *vpath; vxge_dev_t *vdev = (vxge_dev_t *) ifp->if_softc; vpath = &(vdev->vpaths[0]); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (VXGE_TX_TRYLOCK(vpath)) { vxge_send_locked(ifp, vpath); VXGE_TX_UNLOCK(vpath); } } } static inline void vxge_send_locked(ifnet_t ifp, vxge_vpath_t *vpath) { mbuf_t m_head = NULL; vxge_dev_t *vdev = vpath->vdev; VXGE_TX_LOCK_ASSERT(vpath); if ((!vdev->is_initialized) || ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING)) return; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (vxge_xmit(ifp, vpath, &m_head)) { if (m_head == NULL) break; ifp->if_drv_flags |= IFF_DRV_OACTIVE; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); VXGE_DRV_STATS(vpath, tx_again); break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); } } #if __FreeBSD_version >= 800000 int vxge_mq_send(ifnet_t ifp, mbuf_t m_head) { int i = 0, err = 0; vxge_vpath_t *vpath; vxge_dev_t *vdev = (vxge_dev_t *) ifp->if_softc; if (vdev->config.tx_steering) { i = vxge_vpath_get(vdev, m_head); - } else if ((m_head->m_flags & M_FLOWID) != 0) { + } else if (M_HASHTYPE_GET(m_head) != M_HASHTYPE_NONE) { i = m_head->m_pkthdr.flowid % vdev->no_of_vpath; } vpath = &(vdev->vpaths[i]); if (VXGE_TX_TRYLOCK(vpath)) { err = vxge_mq_send_locked(ifp, vpath, m_head); VXGE_TX_UNLOCK(vpath); } else err = drbr_enqueue(ifp, vpath->br, m_head); return (err); } static inline int vxge_mq_send_locked(ifnet_t ifp, vxge_vpath_t *vpath, mbuf_t m_head) { int err = 0; mbuf_t next = NULL; vxge_dev_t *vdev = vpath->vdev; VXGE_TX_LOCK_ASSERT(vpath); if ((!vdev->is_initialized) || ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING)) { err = drbr_enqueue(ifp, vpath->br, m_head); goto _exit0; } if (m_head == NULL) { next = drbr_dequeue(ifp, vpath->br); } else if (drbr_needs_enqueue(ifp, vpath->br)) { if ((err = drbr_enqueue(ifp, vpath->br, m_head)) != 0) goto _exit0; next = drbr_dequeue(ifp, vpath->br); } else next = m_head; /* Process the queue */ while (next != NULL) { if ((err = vxge_xmit(ifp, vpath, &next)) != 0) { if (next == NULL) break; ifp->if_drv_flags |= IFF_DRV_OACTIVE; err = drbr_enqueue(ifp, vpath->br, next); VXGE_DRV_STATS(vpath, tx_again); break; } if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len); if (next->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; next = drbr_dequeue(ifp, vpath->br); } _exit0: return (err); } void vxge_mq_qflush(ifnet_t ifp) { int i; mbuf_t m_head; vxge_vpath_t *vpath; vxge_dev_t *vdev = (vxge_dev_t *) ifp->if_softc; for (i = 0; i < vdev->no_of_vpath; i++) { vpath = &(vdev->vpaths[i]); if (!vpath->handle) continue; VXGE_TX_LOCK(vpath); while ((m_head = buf_ring_dequeue_sc(vpath->br)) != NULL) vxge_free_packet(m_head); VXGE_TX_UNLOCK(vpath); } if_qflush(ifp); } #endif static inline int vxge_xmit(ifnet_t ifp, vxge_vpath_t *vpath, mbuf_t *m_headp) { int err, num_segs = 0; u32 txdl_avail, dma_index, tagged = 0; dma_addr_t dma_addr; bus_size_t dma_sizes; void *dtr_priv; vxge_txdl_priv_t *txdl_priv; vxge_hal_txdl_h txdlh; vxge_hal_status_e status; vxge_dev_t *vdev = vpath->vdev; VXGE_DRV_STATS(vpath, tx_xmit); txdl_avail = vxge_hal_fifo_free_txdl_count_get(vpath->handle); if (txdl_avail < VXGE_TX_LOW_THRESHOLD) { VXGE_DRV_STATS(vpath, tx_low_dtr_cnt); err = ENOBUFS; goto _exit0; } /* Reserve descriptors */ status = vxge_hal_fifo_txdl_reserve(vpath->handle, &txdlh, &dtr_priv); if (status != VXGE_HAL_OK) { VXGE_DRV_STATS(vpath, tx_reserve_failed); err = ENOBUFS; goto _exit0; } /* Update Tx private structure for this descriptor */ txdl_priv = (vxge_txdl_priv_t *) dtr_priv; /* * Map the packet for DMA. * Returns number of segments through num_segs. */ err = vxge_dma_mbuf_coalesce(vpath->dma_tag_tx, txdl_priv->dma_map, m_headp, txdl_priv->dma_buffers, &num_segs); if (vpath->driver_stats.tx_max_frags < num_segs) vpath->driver_stats.tx_max_frags = num_segs; if (err == ENOMEM) { VXGE_DRV_STATS(vpath, tx_no_dma_setup); vxge_hal_fifo_txdl_free(vpath->handle, txdlh); goto _exit0; } else if (err != 0) { vxge_free_packet(*m_headp); VXGE_DRV_STATS(vpath, tx_no_dma_setup); vxge_hal_fifo_txdl_free(vpath->handle, txdlh); goto _exit0; } txdl_priv->mbuf_pkt = *m_headp; /* Set VLAN tag in descriptor only if this packet has it */ if ((*m_headp)->m_flags & M_VLANTAG) vxge_hal_fifo_txdl_vlan_set(txdlh, (*m_headp)->m_pkthdr.ether_vtag); /* Set descriptor buffer for header and each fragment/segment */ for (dma_index = 0; dma_index < num_segs; dma_index++) { dma_sizes = txdl_priv->dma_buffers[dma_index].ds_len; dma_addr = htole64(txdl_priv->dma_buffers[dma_index].ds_addr); vxge_hal_fifo_txdl_buffer_set(vpath->handle, txdlh, dma_index, dma_addr, dma_sizes); } /* Pre-write Sync of mapping */ bus_dmamap_sync(vpath->dma_tag_tx, txdl_priv->dma_map, BUS_DMASYNC_PREWRITE); if ((*m_headp)->m_pkthdr.csum_flags & CSUM_TSO) { if ((*m_headp)->m_pkthdr.tso_segsz) { VXGE_DRV_STATS(vpath, tx_tso); vxge_hal_fifo_txdl_lso_set(txdlh, VXGE_HAL_FIFO_LSO_FRM_ENCAP_AUTO, (*m_headp)->m_pkthdr.tso_segsz); } } /* Checksum */ if (ifp->if_hwassist > 0) { vxge_hal_fifo_txdl_cksum_set_bits(txdlh, VXGE_HAL_FIFO_TXD_TX_CKO_IPV4_EN | VXGE_HAL_FIFO_TXD_TX_CKO_TCP_EN | VXGE_HAL_FIFO_TXD_TX_CKO_UDP_EN); } if ((vxge_hal_device_check_id(vdev->devh) == VXGE_HAL_CARD_TITAN_1A) && (vdev->hw_fw_version >= VXGE_FW_VERSION(1, 8, 0))) tagged = 1; vxge_hal_fifo_txdl_post(vpath->handle, txdlh, tagged); VXGE_DRV_STATS(vpath, tx_posted); _exit0: return (err); } /* * vxge_tx_replenish * Allocate buffers and set them into descriptors for later use */ /* ARGSUSED */ vxge_hal_status_e vxge_tx_replenish(vxge_hal_vpath_h vpath_handle, vxge_hal_txdl_h txdlh, void *dtr_priv, u32 dtr_index, void *userdata, vxge_hal_reopen_e reopen) { int err = 0; vxge_vpath_t *vpath = (vxge_vpath_t *) userdata; vxge_txdl_priv_t *txdl_priv = (vxge_txdl_priv_t *) dtr_priv; err = bus_dmamap_create(vpath->dma_tag_tx, BUS_DMA_NOWAIT, &txdl_priv->dma_map); return ((err == 0) ? VXGE_HAL_OK : VXGE_HAL_FAIL); } /* * vxge_tx_compl * If the interrupt is due to Tx completion, free the sent buffer */ vxge_hal_status_e vxge_tx_compl(vxge_hal_vpath_h vpath_handle, vxge_hal_txdl_h txdlh, void *dtr_priv, vxge_hal_fifo_tcode_e t_code, void *userdata) { vxge_hal_status_e status = VXGE_HAL_OK; vxge_txdl_priv_t *txdl_priv; vxge_vpath_t *vpath = (vxge_vpath_t *) userdata; vxge_dev_t *vdev = vpath->vdev; ifnet_t ifp = vdev->ifp; VXGE_TX_LOCK(vpath); /* * For each completed descriptor * Get private structure, free buffer, do unmapping, and free descriptor */ do { VXGE_DRV_STATS(vpath, tx_compl); if (t_code != VXGE_HAL_FIFO_T_CODE_OK) { device_printf(vdev->ndev, "tx transfer code %d\n", t_code); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); VXGE_DRV_STATS(vpath, tx_tcode); vxge_hal_fifo_handle_tcode(vpath_handle, txdlh, t_code); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); txdl_priv = (vxge_txdl_priv_t *) dtr_priv; bus_dmamap_unload(vpath->dma_tag_tx, txdl_priv->dma_map); vxge_free_packet(txdl_priv->mbuf_pkt); vxge_hal_fifo_txdl_free(vpath->handle, txdlh); } while (vxge_hal_fifo_txdl_next_completed(vpath_handle, &txdlh, &dtr_priv, &t_code) == VXGE_HAL_OK); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; VXGE_TX_UNLOCK(vpath); return (status); } /* ARGSUSED */ void vxge_tx_term(vxge_hal_vpath_h vpath_handle, vxge_hal_txdl_h txdlh, void *dtr_priv, vxge_hal_txdl_state_e state, void *userdata, vxge_hal_reopen_e reopen) { vxge_vpath_t *vpath = (vxge_vpath_t *) userdata; vxge_txdl_priv_t *txdl_priv = (vxge_txdl_priv_t *) dtr_priv; if (state != VXGE_HAL_TXDL_STATE_POSTED) return; if (txdl_priv != NULL) { bus_dmamap_sync(vpath->dma_tag_tx, txdl_priv->dma_map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(vpath->dma_tag_tx, txdl_priv->dma_map); bus_dmamap_destroy(vpath->dma_tag_tx, txdl_priv->dma_map); vxge_free_packet(txdl_priv->mbuf_pkt); } /* Free the descriptor */ vxge_hal_fifo_txdl_free(vpath->handle, txdlh); } /* * vxge_rx_replenish * Allocate buffers and set them into descriptors for later use */ /* ARGSUSED */ vxge_hal_status_e vxge_rx_replenish(vxge_hal_vpath_h vpath_handle, vxge_hal_rxd_h rxdh, void *dtr_priv, u32 dtr_index, void *userdata, vxge_hal_reopen_e reopen) { int err = 0; vxge_hal_status_e status = VXGE_HAL_OK; vxge_vpath_t *vpath = (vxge_vpath_t *) userdata; vxge_rxd_priv_t *rxd_priv = (vxge_rxd_priv_t *) dtr_priv; /* Create DMA map for these descriptors */ err = bus_dmamap_create(vpath->dma_tag_rx, BUS_DMA_NOWAIT, &rxd_priv->dma_map); if (err == 0) { if (vxge_rx_rxd_1b_set(vpath, rxdh, dtr_priv)) { bus_dmamap_destroy(vpath->dma_tag_rx, rxd_priv->dma_map); status = VXGE_HAL_FAIL; } } return (status); } /* * vxge_rx_compl */ vxge_hal_status_e vxge_rx_compl(vxge_hal_vpath_h vpath_handle, vxge_hal_rxd_h rxdh, void *dtr_priv, u8 t_code, void *userdata) { mbuf_t mbuf_up; vxge_rxd_priv_t *rxd_priv; vxge_hal_ring_rxd_info_t ext_info; vxge_hal_status_e status = VXGE_HAL_OK; vxge_vpath_t *vpath = (vxge_vpath_t *) userdata; vxge_dev_t *vdev = vpath->vdev; struct lro_entry *queued = NULL; struct lro_ctrl *lro = &vpath->lro; /* get the interface pointer */ ifnet_t ifp = vdev->ifp; do { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { vxge_hal_ring_rxd_post(vpath_handle, rxdh); status = VXGE_HAL_FAIL; break; } VXGE_DRV_STATS(vpath, rx_compl); rxd_priv = (vxge_rxd_priv_t *) dtr_priv; /* Gets details of mbuf i.e., packet length */ vxge_rx_rxd_1b_get(vpath, rxdh, dtr_priv); /* * Prepare one buffer to send it to upper layer Since upper * layer frees the buffer do not use rxd_priv->mbuf_pkt. * Meanwhile prepare a new buffer, do mapping, use with the * current descriptor and post descriptor back to ring vpath */ mbuf_up = rxd_priv->mbuf_pkt; if (t_code != VXGE_HAL_RING_RXD_T_CODE_OK) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); VXGE_DRV_STATS(vpath, rx_tcode); status = vxge_hal_ring_handle_tcode(vpath_handle, rxdh, t_code); /* * If transfer code is not for unknown protocols and * vxge_hal_device_handle_tcode is NOT returned * VXGE_HAL_OK * drop this packet and increment rx_tcode stats */ if ((status != VXGE_HAL_OK) && (t_code != VXGE_HAL_RING_T_CODE_L3_PKT_ERR)) { vxge_free_packet(mbuf_up); vxge_hal_ring_rxd_post(vpath_handle, rxdh); continue; } } if (vxge_rx_rxd_1b_set(vpath, rxdh, dtr_priv)) { /* * If unable to allocate buffer, post descriptor back * to vpath for future processing of same packet. */ vxge_hal_ring_rxd_post(vpath_handle, rxdh); continue; } /* Get the extended information */ vxge_hal_ring_rxd_1b_info_get(vpath_handle, rxdh, &ext_info); /* post descriptor with newly allocated mbuf back to vpath */ vxge_hal_ring_rxd_post(vpath_handle, rxdh); vpath->rxd_posted++; if (vpath->rxd_posted % VXGE_RXD_REPLENISH_COUNT == 0) vxge_hal_ring_rxd_post_post_db(vpath_handle); /* * Set successfully computed checksums in the mbuf. * Leave the rest to the stack to be reverified. */ vxge_rx_checksum(ext_info, mbuf_up); #if __FreeBSD_version >= 800000 - mbuf_up->m_flags |= M_FLOWID; + M_HASHTYPE_SET(mbuf_up, M_HASHTYPE_OPAQUE); mbuf_up->m_pkthdr.flowid = vpath->vp_index; #endif /* Post-Read sync for buffers */ bus_dmamap_sync(vpath->dma_tag_rx, rxd_priv->dma_map, BUS_DMASYNC_POSTREAD); vxge_rx_input(ifp, mbuf_up, vpath); } while (vxge_hal_ring_rxd_next_completed(vpath_handle, &rxdh, &dtr_priv, &t_code) == VXGE_HAL_OK); /* Flush any outstanding LRO work */ if (vpath->lro_enable && vpath->lro.lro_cnt) { while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } } return (status); } static inline void vxge_rx_input(ifnet_t ifp, mbuf_t mbuf_up, vxge_vpath_t *vpath) { if (vpath->lro_enable && vpath->lro.lro_cnt) { if (tcp_lro_rx(&vpath->lro, mbuf_up, 0) == 0) return; } (*ifp->if_input) (ifp, mbuf_up); } static inline void vxge_rx_checksum(vxge_hal_ring_rxd_info_t ext_info, mbuf_t mbuf_up) { if (!(ext_info.proto & VXGE_HAL_FRAME_PROTO_IP_FRAG) && (ext_info.proto & VXGE_HAL_FRAME_PROTO_TCP_OR_UDP) && ext_info.l3_cksum_valid && ext_info.l4_cksum_valid) { mbuf_up->m_pkthdr.csum_data = htons(0xffff); mbuf_up->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mbuf_up->m_pkthdr.csum_flags |= CSUM_IP_VALID; mbuf_up->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); } else { if (ext_info.vlan) { mbuf_up->m_pkthdr.ether_vtag = ext_info.vlan; mbuf_up->m_flags |= M_VLANTAG; } } } /* * vxge_rx_term During unload terminate and free all descriptors * @vpath_handle Rx vpath Handle @rxdh Rx Descriptor Handle @state Descriptor * State @userdata Per-adapter Data @reopen vpath open/reopen option */ /* ARGSUSED */ void vxge_rx_term(vxge_hal_vpath_h vpath_handle, vxge_hal_rxd_h rxdh, void *dtr_priv, vxge_hal_rxd_state_e state, void *userdata, vxge_hal_reopen_e reopen) { vxge_vpath_t *vpath = (vxge_vpath_t *) userdata; vxge_rxd_priv_t *rxd_priv = (vxge_rxd_priv_t *) dtr_priv; if (state != VXGE_HAL_RXD_STATE_POSTED) return; if (rxd_priv != NULL) { bus_dmamap_sync(vpath->dma_tag_rx, rxd_priv->dma_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(vpath->dma_tag_rx, rxd_priv->dma_map); bus_dmamap_destroy(vpath->dma_tag_rx, rxd_priv->dma_map); vxge_free_packet(rxd_priv->mbuf_pkt); } /* Free the descriptor */ vxge_hal_ring_rxd_free(vpath_handle, rxdh); } /* * vxge_rx_rxd_1b_get * Get descriptors of packet to send up */ void vxge_rx_rxd_1b_get(vxge_vpath_t *vpath, vxge_hal_rxd_h rxdh, void *dtr_priv) { vxge_rxd_priv_t *rxd_priv = (vxge_rxd_priv_t *) dtr_priv; mbuf_t mbuf_up = rxd_priv->mbuf_pkt; /* Retrieve data from completed descriptor */ vxge_hal_ring_rxd_1b_get(vpath->handle, rxdh, &rxd_priv->dma_addr[0], (u32 *) &rxd_priv->dma_sizes[0]); /* Update newly created buffer to be sent up with packet length */ mbuf_up->m_len = rxd_priv->dma_sizes[0]; mbuf_up->m_pkthdr.len = rxd_priv->dma_sizes[0]; mbuf_up->m_next = NULL; } /* * vxge_rx_rxd_1b_set * Allocates new mbufs to be placed into descriptors */ int vxge_rx_rxd_1b_set(vxge_vpath_t *vpath, vxge_hal_rxd_h rxdh, void *dtr_priv) { int num_segs, err = 0; mbuf_t mbuf_pkt; bus_dmamap_t dma_map; bus_dma_segment_t dma_buffers[1]; vxge_rxd_priv_t *rxd_priv = (vxge_rxd_priv_t *) dtr_priv; vxge_dev_t *vdev = vpath->vdev; mbuf_pkt = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, vdev->rx_mbuf_sz); if (!mbuf_pkt) { err = ENOBUFS; VXGE_DRV_STATS(vpath, rx_no_buf); device_printf(vdev->ndev, "out of memory to allocate mbuf\n"); goto _exit0; } /* Update mbuf's length, packet length and receive interface */ mbuf_pkt->m_len = vdev->rx_mbuf_sz; mbuf_pkt->m_pkthdr.len = vdev->rx_mbuf_sz; mbuf_pkt->m_pkthdr.rcvif = vdev->ifp; /* Load DMA map */ err = vxge_dma_mbuf_coalesce(vpath->dma_tag_rx, vpath->extra_dma_map, &mbuf_pkt, dma_buffers, &num_segs); if (err != 0) { VXGE_DRV_STATS(vpath, rx_map_fail); vxge_free_packet(mbuf_pkt); goto _exit0; } /* Unload DMA map of mbuf in current descriptor */ bus_dmamap_sync(vpath->dma_tag_rx, rxd_priv->dma_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(vpath->dma_tag_rx, rxd_priv->dma_map); /* Update descriptor private data */ dma_map = rxd_priv->dma_map; rxd_priv->mbuf_pkt = mbuf_pkt; rxd_priv->dma_addr[0] = htole64(dma_buffers->ds_addr); rxd_priv->dma_map = vpath->extra_dma_map; vpath->extra_dma_map = dma_map; /* Pre-Read/Write sync */ bus_dmamap_sync(vpath->dma_tag_rx, rxd_priv->dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Set descriptor buffer */ vxge_hal_ring_rxd_1b_set(rxdh, rxd_priv->dma_addr[0], vdev->rx_mbuf_sz); _exit0: return (err); } /* * vxge_link_up * Callback for Link-up indication from HAL */ /* ARGSUSED */ void vxge_link_up(vxge_hal_device_h devh, void *userdata) { int i; vxge_vpath_t *vpath; vxge_hal_device_hw_info_t *hw_info; vxge_dev_t *vdev = (vxge_dev_t *) userdata; hw_info = &vdev->config.hw_info; ifnet_t ifp = vdev->ifp; if (vdev->config.intr_mode == VXGE_HAL_INTR_MODE_MSIX) { for (i = 0; i < vdev->no_of_vpath; i++) { vpath = &(vdev->vpaths[i]); vxge_hal_vpath_tti_ci_set(vpath->handle); vxge_hal_vpath_rti_ci_set(vpath->handle); } } if (vdev->is_privilaged && (hw_info->ports > 1)) { vxge_active_port_update(vdev); device_printf(vdev->ndev, "Active Port : %lld\n", vdev->active_port); } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } /* * vxge_link_down * Callback for Link-down indication from HAL */ /* ARGSUSED */ void vxge_link_down(vxge_hal_device_h devh, void *userdata) { int i; vxge_vpath_t *vpath; vxge_dev_t *vdev = (vxge_dev_t *) userdata; ifnet_t ifp = vdev->ifp; if (vdev->config.intr_mode == VXGE_HAL_INTR_MODE_MSIX) { for (i = 0; i < vdev->no_of_vpath; i++) { vpath = &(vdev->vpaths[i]); vxge_hal_vpath_tti_ci_reset(vpath->handle); vxge_hal_vpath_rti_ci_reset(vpath->handle); } } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_DOWN); } /* * vxge_reset */ void vxge_reset(vxge_dev_t *vdev) { if (!vdev->is_initialized) return; VXGE_DRV_LOCK(vdev); vxge_stop_locked(vdev); vxge_init_locked(vdev); VXGE_DRV_UNLOCK(vdev); } /* * vxge_crit_error * Callback for Critical error indication from HAL */ /* ARGSUSED */ void vxge_crit_error(vxge_hal_device_h devh, void *userdata, vxge_hal_event_e type, u64 serr_data) { vxge_dev_t *vdev = (vxge_dev_t *) userdata; ifnet_t ifp = vdev->ifp; switch (type) { case VXGE_HAL_EVENT_SERR: case VXGE_HAL_EVENT_KDFCCTL: case VXGE_HAL_EVENT_CRITICAL: vxge_hal_device_intr_disable(vdev->devh); ifp->if_drv_flags |= IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_DOWN); break; default: break; } } /* * vxge_ifp_setup */ int vxge_ifp_setup(device_t ndev) { ifnet_t ifp; int i, j, err = 0; vxge_dev_t *vdev = (vxge_dev_t *) device_get_softc(ndev); for (i = 0, j = 0; i < VXGE_HAL_MAX_VIRTUAL_PATHS; i++) { if (!bVAL1(vdev->config.hw_info.vpath_mask, i)) continue; if (j >= vdev->no_of_vpath) break; vdev->vpaths[j].vp_id = i; vdev->vpaths[j].vp_index = j; vdev->vpaths[j].vdev = vdev; vdev->vpaths[j].is_configured = TRUE; vxge_os_memcpy((u8 *) vdev->vpaths[j].mac_addr, (u8 *) (vdev->config.hw_info.mac_addrs[i]), (size_t) ETHER_ADDR_LEN); j++; } /* Get interface ifnet structure for this Ether device */ ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(vdev->ndev, "memory allocation for ifnet failed\n"); err = ENXIO; goto _exit0; } vdev->ifp = ifp; /* Initialize interface ifnet structure */ if_initname(ifp, device_get_name(ndev), device_get_unit(ndev)); ifp->if_baudrate = VXGE_BAUDRATE; ifp->if_init = vxge_init; ifp->if_softc = vdev; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = vxge_ioctl; ifp->if_start = vxge_send; #if __FreeBSD_version >= 800000 ifp->if_transmit = vxge_mq_send; ifp->if_qflush = vxge_mq_qflush; #endif ifp->if_snd.ifq_drv_maxlen = max(vdev->config.ifq_maxlen, ifqmaxlen); IFQ_SET_MAXLEN(&ifp->if_snd, ifp->if_snd.ifq_drv_maxlen); /* IFQ_SET_READY(&ifp->if_snd); */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; ifp->if_capabilities |= IFCAP_JUMBO_MTU; if (vdev->config.tso_enable) vxge_tso_config(vdev); if (vdev->config.lro_enable) ifp->if_capabilities |= IFCAP_LRO; ifp->if_capenable = ifp->if_capabilities; strlcpy(vdev->ndev_name, device_get_nameunit(ndev), sizeof(vdev->ndev_name)); /* Attach the interface */ ether_ifattach(ifp, vdev->vpaths[0].mac_addr); _exit0: return (err); } /* * vxge_isr_setup * Register isr functions */ int vxge_isr_setup(vxge_dev_t *vdev) { int i, irq_rid, err = 0; vxge_vpath_t *vpath; void *isr_func_arg; void (*isr_func_ptr) (void *); switch (vdev->config.intr_mode) { case VXGE_HAL_INTR_MODE_IRQLINE: err = bus_setup_intr(vdev->ndev, vdev->config.isr_info[0].irq_res, (INTR_TYPE_NET | INTR_MPSAFE), vxge_isr_filter, vxge_isr_line, vdev, &vdev->config.isr_info[0].irq_handle); break; case VXGE_HAL_INTR_MODE_MSIX: for (i = 0; i < vdev->intr_count; i++) { irq_rid = vdev->config.isr_info[i].irq_rid; vpath = &vdev->vpaths[irq_rid / 4]; if ((irq_rid % 4) == 2) { isr_func_ptr = vxge_isr_msix; isr_func_arg = (void *) vpath; } else if ((irq_rid % 4) == 3) { isr_func_ptr = vxge_isr_msix_alarm; isr_func_arg = (void *) vpath; } else break; err = bus_setup_intr(vdev->ndev, vdev->config.isr_info[i].irq_res, (INTR_TYPE_NET | INTR_MPSAFE), NULL, (void *) isr_func_ptr, (void *) isr_func_arg, &vdev->config.isr_info[i].irq_handle); if (err != 0) break; } if (err != 0) { /* Teardown interrupt handler */ while (--i > 0) bus_teardown_intr(vdev->ndev, vdev->config.isr_info[i].irq_res, vdev->config.isr_info[i].irq_handle); } break; } return (err); } /* * vxge_isr_filter * ISR filter function - filter interrupts from other shared devices */ int vxge_isr_filter(void *handle) { u64 val64 = 0; vxge_dev_t *vdev = (vxge_dev_t *) handle; __hal_device_t *hldev = (__hal_device_t *) vdev->devh; vxge_hal_common_reg_t *common_reg = (vxge_hal_common_reg_t *) (hldev->common_reg); val64 = vxge_os_pio_mem_read64(vdev->pdev, (vdev->devh)->regh0, &common_reg->titan_general_int_status); return ((val64) ? FILTER_SCHEDULE_THREAD : FILTER_STRAY); } /* * vxge_isr_line * Interrupt service routine for Line interrupts */ void vxge_isr_line(void *vdev_ptr) { vxge_dev_t *vdev = (vxge_dev_t *) vdev_ptr; vxge_hal_device_handle_irq(vdev->devh, 0); } void vxge_isr_msix(void *vpath_ptr) { u32 got_rx = 0; u32 got_tx = 0; __hal_virtualpath_t *hal_vpath; vxge_vpath_t *vpath = (vxge_vpath_t *) vpath_ptr; vxge_dev_t *vdev = vpath->vdev; hal_vpath = ((__hal_vpath_handle_t *) vpath->handle)->vpath; VXGE_DRV_STATS(vpath, isr_msix); VXGE_HAL_DEVICE_STATS_SW_INFO_TRAFFIC_INTR(vdev->devh); vxge_hal_vpath_mf_msix_mask(vpath->handle, vpath->msix_vec); /* processing rx */ vxge_hal_vpath_poll_rx(vpath->handle, &got_rx); /* processing tx */ if (hal_vpath->vp_config->fifo.enable) { vxge_intr_coalesce_tx(vpath); vxge_hal_vpath_poll_tx(vpath->handle, &got_tx); } vxge_hal_vpath_mf_msix_unmask(vpath->handle, vpath->msix_vec); } void vxge_isr_msix_alarm(void *vpath_ptr) { int i; vxge_hal_status_e status = VXGE_HAL_OK; vxge_vpath_t *vpath = (vxge_vpath_t *) vpath_ptr; vxge_dev_t *vdev = vpath->vdev; VXGE_HAL_DEVICE_STATS_SW_INFO_NOT_TRAFFIC_INTR(vdev->devh); /* Process alarms in each vpath */ for (i = 0; i < vdev->no_of_vpath; i++) { vpath = &(vdev->vpaths[i]); vxge_hal_vpath_mf_msix_mask(vpath->handle, vpath->msix_vec_alarm); status = vxge_hal_vpath_alarm_process(vpath->handle, 0); if ((status == VXGE_HAL_ERR_EVENT_SLOT_FREEZE) || (status == VXGE_HAL_ERR_EVENT_SERR)) { device_printf(vdev->ndev, "processing alarms urecoverable error %x\n", status); /* Stop the driver */ vdev->is_initialized = FALSE; break; } vxge_hal_vpath_mf_msix_unmask(vpath->handle, vpath->msix_vec_alarm); } } /* * vxge_msix_enable */ vxge_hal_status_e vxge_msix_enable(vxge_dev_t *vdev) { int i, first_vp_id, msix_id; vxge_vpath_t *vpath; vxge_hal_status_e status = VXGE_HAL_OK; /* * Unmasking and Setting MSIX vectors before enabling interrupts * tim[] : 0 - Tx ## 1 - Rx ## 2 - UMQ-DMQ ## 0 - BITMAP */ int tim[4] = {0, 1, 0, 0}; for (i = 0; i < vdev->no_of_vpath; i++) { vpath = vdev->vpaths + i; first_vp_id = vdev->vpaths[0].vp_id; msix_id = vpath->vp_id * VXGE_HAL_VPATH_MSIX_ACTIVE; tim[1] = vpath->msix_vec = msix_id + 1; vpath->msix_vec_alarm = first_vp_id * VXGE_HAL_VPATH_MSIX_ACTIVE + VXGE_HAL_VPATH_MSIX_ALARM_ID; status = vxge_hal_vpath_mf_msix_set(vpath->handle, tim, VXGE_HAL_VPATH_MSIX_ALARM_ID); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "failed to set msix vectors to vpath\n"); break; } vxge_hal_vpath_mf_msix_unmask(vpath->handle, vpath->msix_vec); vxge_hal_vpath_mf_msix_unmask(vpath->handle, vpath->msix_vec_alarm); } return (status); } /* * vxge_media_init * Initializes, adds and sets media */ void vxge_media_init(vxge_dev_t *vdev) { ifmedia_init(&vdev->media, IFM_IMASK, vxge_media_change, vxge_media_status); /* Add supported media */ ifmedia_add(&vdev->media, IFM_ETHER | vdev->ifm_optics | IFM_FDX, 0, NULL); /* Set media */ ifmedia_add(&vdev->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&vdev->media, IFM_ETHER | IFM_AUTO); } /* * vxge_media_status * Callback for interface media settings */ void vxge_media_status(ifnet_t ifp, struct ifmediareq *ifmr) { vxge_dev_t *vdev = (vxge_dev_t *) ifp->if_softc; vxge_hal_device_t *hldev = vdev->devh; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; /* set link state */ if (vxge_hal_device_link_state_get(hldev) == VXGE_HAL_LINK_UP) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= vdev->ifm_optics | IFM_FDX; if_link_state_change(ifp, LINK_STATE_UP); } } /* * vxge_media_change * Media change driver callback */ int vxge_media_change(ifnet_t ifp) { vxge_dev_t *vdev = (vxge_dev_t *) ifp->if_softc; struct ifmedia *ifmediap = &vdev->media; return (IFM_TYPE(ifmediap->ifm_media) != IFM_ETHER ? EINVAL : 0); } /* * Allocate PCI resources */ int vxge_alloc_resources(vxge_dev_t *vdev) { int err = 0; vxge_pci_info_t *pci_info = NULL; vxge_free_resources_e error_level = VXGE_FREE_NONE; device_t ndev = vdev->ndev; /* Allocate Buffer for HAL Device Configuration */ vdev->device_config = (vxge_hal_device_config_t *) vxge_mem_alloc(sizeof(vxge_hal_device_config_t)); if (!vdev->device_config) { err = ENOMEM; error_level = VXGE_DISABLE_PCI_BUSMASTER; device_printf(vdev->ndev, "failed to allocate memory for device config\n"); goto _exit0; } pci_info = (vxge_pci_info_t *) vxge_mem_alloc(sizeof(vxge_pci_info_t)); if (!pci_info) { error_level = VXGE_FREE_DEVICE_CONFIG; err = ENOMEM; device_printf(vdev->ndev, "failed to allocate memory for pci info\n"); goto _exit0; } pci_info->ndev = ndev; vdev->pdev = pci_info; err = vxge_alloc_bar_resources(vdev, 0); if (err != 0) { error_level = VXGE_FREE_BAR0; goto _exit0; } err = vxge_alloc_bar_resources(vdev, 1); if (err != 0) { error_level = VXGE_FREE_BAR1; goto _exit0; } err = vxge_alloc_bar_resources(vdev, 2); if (err != 0) error_level = VXGE_FREE_BAR2; _exit0: if (error_level) vxge_free_resources(ndev, error_level); return (err); } /* * vxge_alloc_bar_resources * Allocates BAR resources */ int vxge_alloc_bar_resources(vxge_dev_t *vdev, int i) { int err = 0; int res_id = 0; vxge_pci_info_t *pci_info = vdev->pdev; res_id = PCIR_BAR((i == 0) ? 0 : (i * 2)); pci_info->bar_info[i] = bus_alloc_resource_any(vdev->ndev, SYS_RES_MEMORY, &res_id, RF_ACTIVE); if (pci_info->bar_info[i] == NULL) { device_printf(vdev->ndev, "failed to allocate memory for bus resources\n"); err = ENOMEM; goto _exit0; } pci_info->reg_map[i] = (vxge_bus_res_t *) vxge_mem_alloc(sizeof(vxge_bus_res_t)); if (pci_info->reg_map[i] == NULL) { device_printf(vdev->ndev, "failed to allocate memory bar resources\n"); err = ENOMEM; goto _exit0; } ((vxge_bus_res_t *) (pci_info->reg_map[i]))->bus_space_tag = rman_get_bustag(pci_info->bar_info[i]); ((vxge_bus_res_t *) (pci_info->reg_map[i]))->bus_space_handle = rman_get_bushandle(pci_info->bar_info[i]); ((vxge_bus_res_t *) (pci_info->reg_map[i]))->bar_start_addr = pci_info->bar_info[i]; ((vxge_bus_res_t *) (pci_info->reg_map[i]))->bus_res_len = rman_get_size(pci_info->bar_info[i]); _exit0: return (err); } /* * vxge_alloc_isr_resources */ int vxge_alloc_isr_resources(vxge_dev_t *vdev) { int i, err = 0, irq_rid; int msix_vec_reqd, intr_count, msix_count; int intr_mode = VXGE_HAL_INTR_MODE_IRQLINE; if (vdev->config.intr_mode == VXGE_HAL_INTR_MODE_MSIX) { /* MSI-X messages supported by device */ intr_count = pci_msix_count(vdev->ndev); if (intr_count) { msix_vec_reqd = 4 * vdev->no_of_vpath; if (intr_count >= msix_vec_reqd) { intr_count = msix_vec_reqd; err = pci_alloc_msix(vdev->ndev, &intr_count); if (err == 0) intr_mode = VXGE_HAL_INTR_MODE_MSIX; } if ((err != 0) || (intr_count < msix_vec_reqd)) { device_printf(vdev->ndev, "Unable to allocate " "msi/x vectors switching to INTA mode\n"); } } } err = 0; vdev->intr_count = 0; vdev->config.intr_mode = intr_mode; switch (vdev->config.intr_mode) { case VXGE_HAL_INTR_MODE_IRQLINE: vdev->config.isr_info[0].irq_rid = 0; vdev->config.isr_info[0].irq_res = bus_alloc_resource_any(vdev->ndev, SYS_RES_IRQ, &vdev->config.isr_info[0].irq_rid, (RF_SHAREABLE | RF_ACTIVE)); if (vdev->config.isr_info[0].irq_res == NULL) { device_printf(vdev->ndev, "failed to allocate line interrupt resource\n"); err = ENOMEM; goto _exit0; } vdev->intr_count++; break; case VXGE_HAL_INTR_MODE_MSIX: msix_count = 0; for (i = 0; i < vdev->no_of_vpath; i++) { irq_rid = i * 4; vdev->config.isr_info[msix_count].irq_rid = irq_rid + 2; vdev->config.isr_info[msix_count].irq_res = bus_alloc_resource_any(vdev->ndev, SYS_RES_IRQ, &vdev->config.isr_info[msix_count].irq_rid, (RF_SHAREABLE | RF_ACTIVE)); if (vdev->config.isr_info[msix_count].irq_res == NULL) { device_printf(vdev->ndev, "allocating bus resource (rid %d) failed\n", vdev->config.isr_info[msix_count].irq_rid); err = ENOMEM; goto _exit0; } vdev->intr_count++; err = bus_bind_intr(vdev->ndev, vdev->config.isr_info[msix_count].irq_res, (i % mp_ncpus)); if (err != 0) break; msix_count++; } vdev->config.isr_info[msix_count].irq_rid = 3; vdev->config.isr_info[msix_count].irq_res = bus_alloc_resource_any(vdev->ndev, SYS_RES_IRQ, &vdev->config.isr_info[msix_count].irq_rid, (RF_SHAREABLE | RF_ACTIVE)); if (vdev->config.isr_info[msix_count].irq_res == NULL) { device_printf(vdev->ndev, "allocating bus resource (rid %d) failed\n", vdev->config.isr_info[msix_count].irq_rid); err = ENOMEM; goto _exit0; } vdev->intr_count++; err = bus_bind_intr(vdev->ndev, vdev->config.isr_info[msix_count].irq_res, (i % mp_ncpus)); break; } vdev->device_config->intr_mode = vdev->config.intr_mode; _exit0: return (err); } /* * vxge_free_resources * Undo what-all we did during load/attach */ void vxge_free_resources(device_t ndev, vxge_free_resources_e vxge_free_resource) { int i; vxge_dev_t *vdev; vdev = (vxge_dev_t *) device_get_softc(ndev); switch (vxge_free_resource) { case VXGE_FREE_ALL: for (i = 0; i < vdev->intr_count; i++) { bus_teardown_intr(ndev, vdev->config.isr_info[i].irq_res, vdev->config.isr_info[i].irq_handle); } /* FALLTHROUGH */ case VXGE_FREE_INTERFACE: ether_ifdetach(vdev->ifp); bus_generic_detach(ndev); if_free(vdev->ifp); /* FALLTHROUGH */ case VXGE_FREE_MEDIA: ifmedia_removeall(&vdev->media); /* FALLTHROUGH */ case VXGE_FREE_MUTEX: vxge_mutex_destroy(vdev); /* FALLTHROUGH */ case VXGE_FREE_VPATH: vxge_mem_free(vdev->vpaths, vdev->no_of_vpath * sizeof(vxge_vpath_t)); /* FALLTHROUGH */ case VXGE_FREE_TERMINATE_DEVICE: if (vdev->devh != NULL) { vxge_hal_device_private_set(vdev->devh, 0); vxge_hal_device_terminate(vdev->devh); } /* FALLTHROUGH */ case VXGE_FREE_ISR_RESOURCE: vxge_free_isr_resources(vdev); /* FALLTHROUGH */ case VXGE_FREE_BAR2: vxge_free_bar_resources(vdev, 2); /* FALLTHROUGH */ case VXGE_FREE_BAR1: vxge_free_bar_resources(vdev, 1); /* FALLTHROUGH */ case VXGE_FREE_BAR0: vxge_free_bar_resources(vdev, 0); /* FALLTHROUGH */ case VXGE_FREE_PCI_INFO: vxge_mem_free(vdev->pdev, sizeof(vxge_pci_info_t)); /* FALLTHROUGH */ case VXGE_FREE_DEVICE_CONFIG: vxge_mem_free(vdev->device_config, sizeof(vxge_hal_device_config_t)); /* FALLTHROUGH */ case VXGE_DISABLE_PCI_BUSMASTER: pci_disable_busmaster(ndev); /* FALLTHROUGH */ case VXGE_FREE_TERMINATE_DRIVER: if (vxge_dev_ref_count) { --vxge_dev_ref_count; if (0 == vxge_dev_ref_count) vxge_hal_driver_terminate(); } /* FALLTHROUGH */ default: case VXGE_FREE_NONE: break; /* NOTREACHED */ } } void vxge_free_isr_resources(vxge_dev_t *vdev) { int i; switch (vdev->config.intr_mode) { case VXGE_HAL_INTR_MODE_IRQLINE: if (vdev->config.isr_info[0].irq_res) { bus_release_resource(vdev->ndev, SYS_RES_IRQ, vdev->config.isr_info[0].irq_rid, vdev->config.isr_info[0].irq_res); vdev->config.isr_info[0].irq_res = NULL; } break; case VXGE_HAL_INTR_MODE_MSIX: for (i = 0; i < vdev->intr_count; i++) { if (vdev->config.isr_info[i].irq_res) { bus_release_resource(vdev->ndev, SYS_RES_IRQ, vdev->config.isr_info[i].irq_rid, vdev->config.isr_info[i].irq_res); vdev->config.isr_info[i].irq_res = NULL; } } if (vdev->intr_count) pci_release_msi(vdev->ndev); break; } } void vxge_free_bar_resources(vxge_dev_t *vdev, int i) { int res_id = 0; vxge_pci_info_t *pci_info = vdev->pdev; res_id = PCIR_BAR((i == 0) ? 0 : (i * 2)); if (pci_info->bar_info[i]) bus_release_resource(vdev->ndev, SYS_RES_MEMORY, res_id, pci_info->bar_info[i]); vxge_mem_free(pci_info->reg_map[i], sizeof(vxge_bus_res_t)); } /* * vxge_init_mutex * Initializes mutexes used in driver */ void vxge_mutex_init(vxge_dev_t *vdev) { int i; snprintf(vdev->mtx_drv_name, sizeof(vdev->mtx_drv_name), "%s_drv", vdev->ndev_name); mtx_init(&vdev->mtx_drv, vdev->mtx_drv_name, MTX_NETWORK_LOCK, MTX_DEF); for (i = 0; i < vdev->no_of_vpath; i++) { snprintf(vdev->vpaths[i].mtx_tx_name, sizeof(vdev->vpaths[i].mtx_tx_name), "%s_tx_%d", vdev->ndev_name, i); mtx_init(&vdev->vpaths[i].mtx_tx, vdev->vpaths[i].mtx_tx_name, NULL, MTX_DEF); } } /* * vxge_mutex_destroy * Destroys mutexes used in driver */ void vxge_mutex_destroy(vxge_dev_t *vdev) { int i; for (i = 0; i < vdev->no_of_vpath; i++) VXGE_TX_LOCK_DESTROY(&(vdev->vpaths[i])); VXGE_DRV_LOCK_DESTROY(vdev); } /* * vxge_rth_config */ vxge_hal_status_e vxge_rth_config(vxge_dev_t *vdev) { int i; vxge_hal_vpath_h vpath_handle; vxge_hal_rth_hash_types_t hash_types; vxge_hal_status_e status = VXGE_HAL_OK; u8 mtable[256] = {0}; /* Filling matable with bucket-to-vpath mapping */ vdev->config.rth_bkt_sz = VXGE_DEFAULT_RTH_BUCKET_SIZE; for (i = 0; i < (1 << vdev->config.rth_bkt_sz); i++) mtable[i] = i % vdev->no_of_vpath; /* Fill RTH hash types */ hash_types.hash_type_tcpipv4_en = VXGE_HAL_RING_HASH_TYPE_TCP_IPV4; hash_types.hash_type_tcpipv6_en = VXGE_HAL_RING_HASH_TYPE_TCP_IPV6; hash_types.hash_type_tcpipv6ex_en = VXGE_HAL_RING_HASH_TYPE_TCP_IPV6_EX; hash_types.hash_type_ipv4_en = VXGE_HAL_RING_HASH_TYPE_IPV4; hash_types.hash_type_ipv6_en = VXGE_HAL_RING_HASH_TYPE_IPV6; hash_types.hash_type_ipv6ex_en = VXGE_HAL_RING_HASH_TYPE_IPV6_EX; /* set indirection table, bucket-to-vpath mapping */ status = vxge_hal_vpath_rts_rth_itable_set(vdev->vpath_handles, vdev->no_of_vpath, mtable, ((u32) (1 << vdev->config.rth_bkt_sz))); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "rth configuration failed\n"); goto _exit0; } for (i = 0; i < vdev->no_of_vpath; i++) { vpath_handle = vxge_vpath_handle_get(vdev, i); if (!vpath_handle) continue; status = vxge_hal_vpath_rts_rth_set(vpath_handle, RTH_ALG_JENKINS, &hash_types, vdev->config.rth_bkt_sz, TRUE); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "rth configuration failed for vpath (%d)\n", vdev->vpaths[i].vp_id); break; } } _exit0: return (status); } /* * vxge_vpath_config * Sets HAL parameter values from kenv */ void vxge_vpath_config(vxge_dev_t *vdev) { int i; u32 no_of_vpath = 0; vxge_hal_vp_config_t *vp_config; vxge_hal_device_config_t *device_config = vdev->device_config; device_config->debug_level = VXGE_TRACE; device_config->debug_mask = VXGE_COMPONENT_ALL; device_config->device_poll_millis = VXGE_DEFAULT_DEVICE_POLL_MILLIS; vdev->config.no_of_vpath = min(vdev->config.no_of_vpath, vdev->max_supported_vpath); for (i = 0; i < VXGE_HAL_MAX_VIRTUAL_PATHS; i++) { vp_config = &(device_config->vp_config[i]); vp_config->fifo.enable = VXGE_HAL_FIFO_DISABLE; vp_config->ring.enable = VXGE_HAL_RING_DISABLE; } for (i = 0; i < VXGE_HAL_MAX_VIRTUAL_PATHS; i++) { if (no_of_vpath >= vdev->config.no_of_vpath) break; if (!bVAL1(vdev->config.hw_info.vpath_mask, i)) continue; no_of_vpath++; vp_config = &(device_config->vp_config[i]); vp_config->mtu = VXGE_HAL_DEFAULT_MTU; vp_config->ring.enable = VXGE_HAL_RING_ENABLE; vp_config->ring.post_mode = VXGE_HAL_RING_POST_MODE_DOORBELL; vp_config->ring.buffer_mode = VXGE_HAL_RING_RXD_BUFFER_MODE_1; vp_config->ring.ring_length = vxge_ring_length_get(VXGE_HAL_RING_RXD_BUFFER_MODE_1); vp_config->ring.scatter_mode = VXGE_HAL_RING_SCATTER_MODE_A; vp_config->rpa_all_vid_en = VXGE_DEFAULT_ALL_VID_ENABLE; vp_config->rpa_strip_vlan_tag = VXGE_DEFAULT_STRIP_VLAN_TAG; vp_config->rpa_ucast_all_addr_en = VXGE_HAL_VPATH_RPA_UCAST_ALL_ADDR_DISABLE; vp_config->rti.intr_enable = VXGE_HAL_TIM_INTR_ENABLE; vp_config->rti.txfrm_cnt_en = VXGE_HAL_TXFRM_CNT_EN_ENABLE; vp_config->rti.util_sel = VXGE_HAL_TIM_UTIL_SEL_LEGACY_RX_NET_UTIL; vp_config->rti.uec_a = VXGE_DEFAULT_RTI_RX_UFC_A; vp_config->rti.uec_b = VXGE_DEFAULT_RTI_RX_UFC_B; vp_config->rti.uec_c = VXGE_DEFAULT_RTI_RX_UFC_C; vp_config->rti.uec_d = VXGE_DEFAULT_RTI_RX_UFC_D; vp_config->rti.urange_a = VXGE_DEFAULT_RTI_RX_URANGE_A; vp_config->rti.urange_b = VXGE_DEFAULT_RTI_RX_URANGE_B; vp_config->rti.urange_c = VXGE_DEFAULT_RTI_RX_URANGE_C; vp_config->rti.timer_ac_en = VXGE_HAL_TIM_TIMER_AC_ENABLE; vp_config->rti.timer_ci_en = VXGE_HAL_TIM_TIMER_CI_ENABLE; vp_config->rti.btimer_val = (VXGE_DEFAULT_RTI_BTIMER_VAL * 1000) / 272; vp_config->rti.rtimer_val = (VXGE_DEFAULT_RTI_RTIMER_VAL * 1000) / 272; vp_config->rti.ltimer_val = (VXGE_DEFAULT_RTI_LTIMER_VAL * 1000) / 272; if ((no_of_vpath > 1) && (VXGE_DEFAULT_CONFIG_MQ_ENABLE == 0)) continue; vp_config->fifo.enable = VXGE_HAL_FIFO_ENABLE; vp_config->fifo.max_aligned_frags = VXGE_DEFAULT_FIFO_ALIGNED_FRAGS; vp_config->tti.intr_enable = VXGE_HAL_TIM_INTR_ENABLE; vp_config->tti.txfrm_cnt_en = VXGE_HAL_TXFRM_CNT_EN_ENABLE; vp_config->tti.util_sel = VXGE_HAL_TIM_UTIL_SEL_LEGACY_TX_NET_UTIL; vp_config->tti.uec_a = VXGE_DEFAULT_TTI_TX_UFC_A; vp_config->tti.uec_b = VXGE_DEFAULT_TTI_TX_UFC_B; vp_config->tti.uec_c = VXGE_DEFAULT_TTI_TX_UFC_C; vp_config->tti.uec_d = VXGE_DEFAULT_TTI_TX_UFC_D; vp_config->tti.urange_a = VXGE_DEFAULT_TTI_TX_URANGE_A; vp_config->tti.urange_b = VXGE_DEFAULT_TTI_TX_URANGE_B; vp_config->tti.urange_c = VXGE_DEFAULT_TTI_TX_URANGE_C; vp_config->tti.timer_ac_en = VXGE_HAL_TIM_TIMER_AC_ENABLE; vp_config->tti.timer_ci_en = VXGE_HAL_TIM_TIMER_CI_ENABLE; vp_config->tti.btimer_val = (VXGE_DEFAULT_TTI_BTIMER_VAL * 1000) / 272; vp_config->tti.rtimer_val = (VXGE_DEFAULT_TTI_RTIMER_VAL * 1000) / 272; vp_config->tti.ltimer_val = (VXGE_DEFAULT_TTI_LTIMER_VAL * 1000) / 272; } vdev->no_of_vpath = no_of_vpath; if (vdev->no_of_vpath == 1) vdev->config.tx_steering = 0; if (vdev->config.rth_enable && (vdev->no_of_vpath > 1)) { device_config->rth_en = VXGE_HAL_RTH_ENABLE; device_config->rth_it_type = VXGE_HAL_RTH_IT_TYPE_MULTI_IT; } vdev->config.rth_enable = device_config->rth_en; } /* * vxge_vpath_cb_fn * Virtual path Callback function */ /* ARGSUSED */ static vxge_hal_status_e vxge_vpath_cb_fn(vxge_hal_client_h client_handle, vxge_hal_up_msg_h msgh, vxge_hal_message_type_e msg_type, vxge_hal_obj_id_t obj_id, vxge_hal_result_e result, vxge_hal_opaque_handle_t *opaque_handle) { return (VXGE_HAL_OK); } /* * vxge_vpath_open */ int vxge_vpath_open(vxge_dev_t *vdev) { int i, err = EINVAL; u64 func_id; vxge_vpath_t *vpath; vxge_hal_vpath_attr_t vpath_attr; vxge_hal_status_e status = VXGE_HAL_OK; struct lro_ctrl *lro = NULL; bzero(&vpath_attr, sizeof(vxge_hal_vpath_attr_t)); for (i = 0; i < vdev->no_of_vpath; i++) { vpath = &(vdev->vpaths[i]); lro = &vpath->lro; /* Vpath vpath_attr: FIFO */ vpath_attr.vp_id = vpath->vp_id; vpath_attr.fifo_attr.callback = vxge_tx_compl; vpath_attr.fifo_attr.txdl_init = vxge_tx_replenish; vpath_attr.fifo_attr.txdl_term = vxge_tx_term; vpath_attr.fifo_attr.userdata = vpath; vpath_attr.fifo_attr.per_txdl_space = sizeof(vxge_txdl_priv_t); /* Vpath vpath_attr: Ring */ vpath_attr.ring_attr.callback = vxge_rx_compl; vpath_attr.ring_attr.rxd_init = vxge_rx_replenish; vpath_attr.ring_attr.rxd_term = vxge_rx_term; vpath_attr.ring_attr.userdata = vpath; vpath_attr.ring_attr.per_rxd_space = sizeof(vxge_rxd_priv_t); err = vxge_dma_tags_create(vpath); if (err != 0) { device_printf(vdev->ndev, "failed to create dma tags\n"); break; } #if __FreeBSD_version >= 800000 vpath->br = buf_ring_alloc(VXGE_DEFAULT_BR_SIZE, M_DEVBUF, M_WAITOK, &vpath->mtx_tx); if (vpath->br == NULL) { err = ENOMEM; break; } #endif status = vxge_hal_vpath_open(vdev->devh, &vpath_attr, (vxge_hal_vpath_callback_f) vxge_vpath_cb_fn, NULL, &vpath->handle); if (status != VXGE_HAL_OK) { device_printf(vdev->ndev, "failed to open vpath (%d)\n", vpath->vp_id); err = EPERM; break; } vpath->is_open = TRUE; vdev->vpath_handles[i] = vpath->handle; vpath->tx_ticks = ticks; vpath->rx_ticks = ticks; vpath->tti_rtimer_val = VXGE_DEFAULT_TTI_RTIMER_VAL; vpath->tti_rtimer_val = VXGE_DEFAULT_TTI_RTIMER_VAL; vpath->tx_intr_coalesce = vdev->config.intr_coalesce; vpath->rx_intr_coalesce = vdev->config.intr_coalesce; func_id = vdev->config.hw_info.func_id; if (vdev->config.low_latency && (vdev->config.bw_info[func_id].priority == VXGE_DEFAULT_VPATH_PRIORITY_HIGH)) { vpath->tx_intr_coalesce = 0; } if (vdev->ifp->if_capenable & IFCAP_LRO) { err = tcp_lro_init(lro); if (err != 0) { device_printf(vdev->ndev, "LRO Initialization failed!\n"); break; } vpath->lro_enable = TRUE; lro->ifp = vdev->ifp; } } return (err); } void vxge_tso_config(vxge_dev_t *vdev) { u32 func_id, priority; vxge_hal_status_e status = VXGE_HAL_OK; vdev->ifp->if_capabilities |= IFCAP_TSO4; status = vxge_bw_priority_get(vdev, NULL); if (status == VXGE_HAL_OK) { func_id = vdev->config.hw_info.func_id; priority = vdev->config.bw_info[func_id].priority; if (priority != VXGE_DEFAULT_VPATH_PRIORITY_HIGH) vdev->ifp->if_capabilities &= ~IFCAP_TSO4; } #if __FreeBSD_version >= 800000 if (vdev->ifp->if_capabilities & IFCAP_TSO4) vdev->ifp->if_capabilities |= IFCAP_VLAN_HWTSO; #endif } vxge_hal_status_e vxge_bw_priority_get(vxge_dev_t *vdev, vxge_bw_info_t *bw_info) { u32 priority, bandwidth; u32 vpath_count; u64 func_id, func_mode, vpath_list[VXGE_HAL_MAX_VIRTUAL_PATHS]; vxge_hal_status_e status = VXGE_HAL_OK; func_id = vdev->config.hw_info.func_id; if (bw_info) { func_id = bw_info->func_id; func_mode = vdev->config.hw_info.function_mode; if ((is_single_func(func_mode)) && (func_id > 0)) return (VXGE_HAL_FAIL); } if (vdev->hw_fw_version >= VXGE_FW_VERSION(1, 8, 0)) { status = vxge_hal_vf_rx_bw_get(vdev->devh, func_id, &bandwidth, &priority); } else { status = vxge_hal_get_vpath_list(vdev->devh, func_id, vpath_list, &vpath_count); if (status == VXGE_HAL_OK) { status = vxge_hal_bw_priority_get(vdev->devh, vpath_list[0], &bandwidth, &priority); } } if (status == VXGE_HAL_OK) { if (bw_info) { bw_info->priority = priority; bw_info->bandwidth = bandwidth; } else { vdev->config.bw_info[func_id].priority = priority; vdev->config.bw_info[func_id].bandwidth = bandwidth; } } return (status); } /* * close vpaths */ void vxge_vpath_close(vxge_dev_t *vdev) { int i; vxge_vpath_t *vpath; for (i = 0; i < vdev->no_of_vpath; i++) { vpath = &(vdev->vpaths[i]); if (vpath->handle) vxge_hal_vpath_close(vpath->handle); #if __FreeBSD_version >= 800000 if (vpath->br != NULL) buf_ring_free(vpath->br, M_DEVBUF); #endif /* Free LRO memory */ if (vpath->lro_enable) tcp_lro_free(&vpath->lro); if (vpath->dma_tag_rx) { bus_dmamap_destroy(vpath->dma_tag_rx, vpath->extra_dma_map); bus_dma_tag_destroy(vpath->dma_tag_rx); } if (vpath->dma_tag_tx) bus_dma_tag_destroy(vpath->dma_tag_tx); vpath->handle = NULL; vpath->is_open = FALSE; } } /* * reset vpaths */ void vxge_vpath_reset(vxge_dev_t *vdev) { int i; vxge_hal_vpath_h vpath_handle; vxge_hal_status_e status = VXGE_HAL_OK; for (i = 0; i < vdev->no_of_vpath; i++) { vpath_handle = vxge_vpath_handle_get(vdev, i); if (!vpath_handle) continue; status = vxge_hal_vpath_reset(vpath_handle); if (status != VXGE_HAL_OK) device_printf(vdev->ndev, "failed to reset vpath :%d\n", i); } } static inline int vxge_vpath_get(vxge_dev_t *vdev, mbuf_t mhead) { struct tcphdr *th = NULL; struct udphdr *uh = NULL; struct ip *ip = NULL; struct ip6_hdr *ip6 = NULL; struct ether_vlan_header *eth = NULL; void *ulp = NULL; int ehdrlen, iphlen = 0; u8 ipproto = 0; u16 etype, src_port, dst_port; u16 queue_len, counter = 0; src_port = dst_port = 0; queue_len = vdev->no_of_vpath; eth = mtod(mhead, struct ether_vlan_header *); if (eth->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eth->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eth->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } switch (etype) { case ETHERTYPE_IP: ip = (struct ip *) (mhead->m_data + ehdrlen); iphlen = ip->ip_hl << 2; ipproto = ip->ip_p; th = (struct tcphdr *) ((caddr_t)ip + iphlen); uh = (struct udphdr *) ((caddr_t)ip + iphlen); break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *) (mhead->m_data + ehdrlen); iphlen = sizeof(struct ip6_hdr); ipproto = ip6->ip6_nxt; ulp = mtod(mhead, char *) + iphlen; th = ((struct tcphdr *) (ulp)); uh = ((struct udphdr *) (ulp)); break; default: break; } switch (ipproto) { case IPPROTO_TCP: src_port = th->th_sport; dst_port = th->th_dport; break; case IPPROTO_UDP: src_port = uh->uh_sport; dst_port = uh->uh_dport; break; default: break; } counter = (ntohs(src_port) + ntohs(dst_port)) & vpath_selector[queue_len - 1]; if (counter >= queue_len) counter = queue_len - 1; return (counter); } static inline vxge_hal_vpath_h vxge_vpath_handle_get(vxge_dev_t *vdev, int i) { return (vdev->vpaths[i].is_open ? vdev->vpaths[i].handle : NULL); } int vxge_firmware_verify(vxge_dev_t *vdev) { int err = 0; u64 active_config; vxge_hal_status_e status = VXGE_HAL_FAIL; if (vdev->fw_upgrade) { status = vxge_firmware_upgrade(vdev); if (status == VXGE_HAL_OK) { err = ENXIO; goto _exit0; } } if ((vdev->config.function_mode != VXGE_DEFAULT_CONFIG_VALUE) && (vdev->config.hw_info.function_mode != (u64) vdev->config.function_mode)) { status = vxge_func_mode_set(vdev); if (status == VXGE_HAL_OK) err = ENXIO; } /* l2_switch configuration */ active_config = VXGE_DEFAULT_CONFIG_VALUE; status = vxge_hal_get_active_config(vdev->devh, VXGE_HAL_XMAC_NWIF_ActConfig_L2SwitchEnabled, &active_config); if (status == VXGE_HAL_OK) { vdev->l2_switch = active_config; if (vdev->config.l2_switch != VXGE_DEFAULT_CONFIG_VALUE) { if (vdev->config.l2_switch != active_config) { status = vxge_l2switch_mode_set(vdev); if (status == VXGE_HAL_OK) err = ENXIO; } } } if (vdev->config.hw_info.ports == VXGE_DUAL_PORT_MODE) { if (vxge_port_mode_update(vdev) == ENXIO) err = ENXIO; } _exit0: if (err == ENXIO) device_printf(vdev->ndev, "PLEASE POWER CYCLE THE SYSTEM\n"); return (err); } vxge_hal_status_e vxge_firmware_upgrade(vxge_dev_t *vdev) { u8 *fw_buffer; u32 fw_size; vxge_hal_device_hw_info_t *hw_info; vxge_hal_status_e status = VXGE_HAL_OK; hw_info = &vdev->config.hw_info; fw_size = sizeof(VXGE_FW_ARRAY_NAME); fw_buffer = (u8 *) VXGE_FW_ARRAY_NAME; device_printf(vdev->ndev, "Current firmware version : %s (%s)\n", hw_info->fw_version.version, hw_info->fw_date.date); device_printf(vdev->ndev, "Upgrading firmware to %d.%d.%d\n", VXGE_MIN_FW_MAJOR_VERSION, VXGE_MIN_FW_MINOR_VERSION, VXGE_MIN_FW_BUILD_NUMBER); /* Call HAL API to upgrade firmware */ status = vxge_hal_mrpcim_fw_upgrade(vdev->pdev, (pci_reg_h) vdev->pdev->reg_map[0], (u8 *) vdev->pdev->bar_info[0], fw_buffer, fw_size); device_printf(vdev->ndev, "firmware upgrade %s\n", (status == VXGE_HAL_OK) ? "successful" : "failed"); return (status); } vxge_hal_status_e vxge_func_mode_set(vxge_dev_t *vdev) { u64 active_config; vxge_hal_status_e status = VXGE_HAL_FAIL; status = vxge_hal_mrpcim_pcie_func_mode_set(vdev->devh, vdev->config.function_mode); device_printf(vdev->ndev, "function mode change %s\n", (status == VXGE_HAL_OK) ? "successful" : "failed"); if (status == VXGE_HAL_OK) { vxge_hal_set_fw_api(vdev->devh, 0ULL, VXGE_HAL_API_FUNC_MODE_COMMIT, 0, 0ULL, 0ULL); vxge_hal_get_active_config(vdev->devh, VXGE_HAL_XMAC_NWIF_ActConfig_NWPortMode, &active_config); /* * If in MF + DP mode * if user changes to SF, change port_mode to single port mode */ if (((is_multi_func(vdev->config.hw_info.function_mode)) && is_single_func(vdev->config.function_mode)) && (active_config == VXGE_HAL_DP_NP_MODE_DUAL_PORT)) { vdev->config.port_mode = VXGE_HAL_DP_NP_MODE_SINGLE_PORT; status = vxge_port_mode_set(vdev); } } return (status); } vxge_hal_status_e vxge_port_mode_set(vxge_dev_t *vdev) { vxge_hal_status_e status = VXGE_HAL_FAIL; status = vxge_hal_set_port_mode(vdev->devh, vdev->config.port_mode); device_printf(vdev->ndev, "port mode change %s\n", (status == VXGE_HAL_OK) ? "successful" : "failed"); if (status == VXGE_HAL_OK) { vxge_hal_set_fw_api(vdev->devh, 0ULL, VXGE_HAL_API_FUNC_MODE_COMMIT, 0, 0ULL, 0ULL); /* Configure vpath_mapping for active-active mode only */ if (vdev->config.port_mode == VXGE_HAL_DP_NP_MODE_DUAL_PORT) { status = vxge_hal_config_vpath_map(vdev->devh, VXGE_DUAL_PORT_MAP); device_printf(vdev->ndev, "dual port map change %s\n", (status == VXGE_HAL_OK) ? "successful" : "failed"); } } return (status); } int vxge_port_mode_update(vxge_dev_t *vdev) { int err = 0; u64 active_config; vxge_hal_status_e status = VXGE_HAL_FAIL; if ((vdev->config.port_mode == VXGE_HAL_DP_NP_MODE_DUAL_PORT) && is_single_func(vdev->config.hw_info.function_mode)) { device_printf(vdev->ndev, "Adapter in SF mode, dual port mode is not allowed\n"); err = EPERM; goto _exit0; } active_config = VXGE_DEFAULT_CONFIG_VALUE; status = vxge_hal_get_active_config(vdev->devh, VXGE_HAL_XMAC_NWIF_ActConfig_NWPortMode, &active_config); if (status != VXGE_HAL_OK) { err = EINVAL; goto _exit0; } vdev->port_mode = active_config; if (vdev->config.port_mode != VXGE_DEFAULT_CONFIG_VALUE) { if (vdev->config.port_mode != vdev->port_mode) { status = vxge_port_mode_set(vdev); if (status != VXGE_HAL_OK) { err = EINVAL; goto _exit0; } err = ENXIO; vdev->port_mode = vdev->config.port_mode; } } active_config = VXGE_DEFAULT_CONFIG_VALUE; status = vxge_hal_get_active_config(vdev->devh, VXGE_HAL_XMAC_NWIF_ActConfig_BehaviourOnFail, &active_config); if (status != VXGE_HAL_OK) { err = EINVAL; goto _exit0; } vdev->port_failure = active_config; /* * active/active mode : set to NoMove * active/passive mode: set to Failover-Failback */ if (vdev->port_mode == VXGE_HAL_DP_NP_MODE_DUAL_PORT) vdev->config.port_failure = VXGE_HAL_XMAC_NWIF_OnFailure_NoMove; else if (vdev->port_mode == VXGE_HAL_DP_NP_MODE_ACTIVE_PASSIVE) vdev->config.port_failure = VXGE_HAL_XMAC_NWIF_OnFailure_OtherPortBackOnRestore; if ((vdev->port_mode != VXGE_HAL_DP_NP_MODE_SINGLE_PORT) && (vdev->config.port_failure != vdev->port_failure)) { status = vxge_port_behavior_on_failure_set(vdev); if (status == VXGE_HAL_OK) err = ENXIO; } _exit0: return (err); } vxge_hal_status_e vxge_port_mode_get(vxge_dev_t *vdev, vxge_port_info_t *port_info) { int err = 0; u64 active_config; vxge_hal_status_e status = VXGE_HAL_FAIL; active_config = VXGE_DEFAULT_CONFIG_VALUE; status = vxge_hal_get_active_config(vdev->devh, VXGE_HAL_XMAC_NWIF_ActConfig_NWPortMode, &active_config); if (status != VXGE_HAL_OK) { err = ENXIO; goto _exit0; } port_info->port_mode = active_config; active_config = VXGE_DEFAULT_CONFIG_VALUE; status = vxge_hal_get_active_config(vdev->devh, VXGE_HAL_XMAC_NWIF_ActConfig_BehaviourOnFail, &active_config); if (status != VXGE_HAL_OK) { err = ENXIO; goto _exit0; } port_info->port_failure = active_config; _exit0: return (err); } vxge_hal_status_e vxge_port_behavior_on_failure_set(vxge_dev_t *vdev) { vxge_hal_status_e status = VXGE_HAL_FAIL; status = vxge_hal_set_behavior_on_failure(vdev->devh, vdev->config.port_failure); device_printf(vdev->ndev, "port behaviour on failure change %s\n", (status == VXGE_HAL_OK) ? "successful" : "failed"); if (status == VXGE_HAL_OK) vxge_hal_set_fw_api(vdev->devh, 0ULL, VXGE_HAL_API_FUNC_MODE_COMMIT, 0, 0ULL, 0ULL); return (status); } void vxge_active_port_update(vxge_dev_t *vdev) { u64 active_config; vxge_hal_status_e status = VXGE_HAL_FAIL; active_config = VXGE_DEFAULT_CONFIG_VALUE; status = vxge_hal_get_active_config(vdev->devh, VXGE_HAL_XMAC_NWIF_ActConfig_ActivePort, &active_config); if (status == VXGE_HAL_OK) vdev->active_port = active_config; } vxge_hal_status_e vxge_l2switch_mode_set(vxge_dev_t *vdev) { vxge_hal_status_e status = VXGE_HAL_FAIL; status = vxge_hal_set_l2switch_mode(vdev->devh, vdev->config.l2_switch); device_printf(vdev->ndev, "L2 switch %s\n", (status == VXGE_HAL_OK) ? (vdev->config.l2_switch) ? "enable" : "disable" : "change failed"); if (status == VXGE_HAL_OK) vxge_hal_set_fw_api(vdev->devh, 0ULL, VXGE_HAL_API_FUNC_MODE_COMMIT, 0, 0ULL, 0ULL); return (status); } /* * vxge_promisc_set * Enable Promiscuous Mode */ void vxge_promisc_set(vxge_dev_t *vdev) { int i; ifnet_t ifp; vxge_hal_vpath_h vpath_handle; if (!vdev->is_initialized) return; ifp = vdev->ifp; for (i = 0; i < vdev->no_of_vpath; i++) { vpath_handle = vxge_vpath_handle_get(vdev, i); if (!vpath_handle) continue; if (ifp->if_flags & IFF_PROMISC) vxge_hal_vpath_promisc_enable(vpath_handle); else vxge_hal_vpath_promisc_disable(vpath_handle); } } /* * vxge_change_mtu * Change interface MTU to a requested valid size */ int vxge_change_mtu(vxge_dev_t *vdev, unsigned long new_mtu) { int err = EINVAL; if ((new_mtu < VXGE_HAL_MIN_MTU) || (new_mtu > VXGE_HAL_MAX_MTU)) goto _exit0; (vdev->ifp)->if_mtu = new_mtu; device_printf(vdev->ndev, "MTU changed to %u\n", (vdev->ifp)->if_mtu); if (vdev->is_initialized) { if_down(vdev->ifp); vxge_reset(vdev); if_up(vdev->ifp); } err = 0; _exit0: return (err); } /* * Creates DMA tags for both Tx and Rx */ int vxge_dma_tags_create(vxge_vpath_t *vpath) { int err = 0; bus_size_t max_size, boundary; vxge_dev_t *vdev = vpath->vdev; ifnet_t ifp = vdev->ifp; max_size = ifp->if_mtu + VXGE_HAL_MAC_HEADER_MAX_SIZE + VXGE_HAL_HEADER_ETHERNET_II_802_3_ALIGN; VXGE_BUFFER_ALIGN(max_size, 128) if (max_size <= MCLBYTES) vdev->rx_mbuf_sz = MCLBYTES; else vdev->rx_mbuf_sz = (max_size > MJUMPAGESIZE) ? MJUM9BYTES : MJUMPAGESIZE; boundary = (max_size > PAGE_SIZE) ? 0 : PAGE_SIZE; /* DMA tag for Tx */ err = bus_dma_tag_create( bus_get_dma_tag(vdev->ndev), 1, PAGE_SIZE, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, VXGE_TSO_SIZE, VXGE_MAX_SEGS, PAGE_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &(vpath->dma_tag_tx)); if (err != 0) goto _exit0; /* DMA tag for Rx */ err = bus_dma_tag_create( bus_get_dma_tag(vdev->ndev), 1, boundary, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, vdev->rx_mbuf_sz, 1, vdev->rx_mbuf_sz, BUS_DMA_ALLOCNOW, NULL, NULL, &(vpath->dma_tag_rx)); if (err != 0) goto _exit1; /* Create DMA map for this descriptor */ err = bus_dmamap_create(vpath->dma_tag_rx, BUS_DMA_NOWAIT, &vpath->extra_dma_map); if (err == 0) goto _exit0; bus_dma_tag_destroy(vpath->dma_tag_rx); _exit1: bus_dma_tag_destroy(vpath->dma_tag_tx); _exit0: return (err); } static inline int vxge_dma_mbuf_coalesce(bus_dma_tag_t dma_tag_tx, bus_dmamap_t dma_map, mbuf_t * m_headp, bus_dma_segment_t * dma_buffers, int *num_segs) { int err = 0; mbuf_t mbuf_pkt = NULL; retry: err = bus_dmamap_load_mbuf_sg(dma_tag_tx, dma_map, *m_headp, dma_buffers, num_segs, BUS_DMA_NOWAIT); if (err == EFBIG) { /* try to defrag, too many segments */ mbuf_pkt = m_defrag(*m_headp, M_NOWAIT); if (mbuf_pkt == NULL) { err = ENOBUFS; goto _exit0; } *m_headp = mbuf_pkt; goto retry; } _exit0: return (err); } int vxge_device_hw_info_get(vxge_dev_t *vdev) { int i, err = ENXIO; u64 vpath_mask = 0; u32 max_supported_vpath = 0; u32 fw_ver_maj_min; vxge_firmware_upgrade_e fw_option; vxge_hal_status_e status = VXGE_HAL_OK; vxge_hal_device_hw_info_t *hw_info; status = vxge_hal_device_hw_info_get(vdev->pdev, (pci_reg_h) vdev->pdev->reg_map[0], (u8 *) vdev->pdev->bar_info[0], &vdev->config.hw_info); if (status != VXGE_HAL_OK) goto _exit0; hw_info = &vdev->config.hw_info; vpath_mask = hw_info->vpath_mask; if (vpath_mask == 0) { device_printf(vdev->ndev, "No vpaths available in device\n"); goto _exit0; } fw_option = vdev->config.fw_option; /* Check how many vpaths are available */ for (i = 0; i < VXGE_HAL_MAX_VIRTUAL_PATHS; i++) { if (!((vpath_mask) & mBIT(i))) continue; max_supported_vpath++; } vdev->max_supported_vpath = max_supported_vpath; status = vxge_hal_device_is_privileged(hw_info->host_type, hw_info->func_id); vdev->is_privilaged = (status == VXGE_HAL_OK) ? TRUE : FALSE; vdev->hw_fw_version = VXGE_FW_VERSION( hw_info->fw_version.major, hw_info->fw_version.minor, hw_info->fw_version.build); fw_ver_maj_min = VXGE_FW_MAJ_MIN_VERSION(hw_info->fw_version.major, hw_info->fw_version.minor); if ((fw_option >= VXGE_FW_UPGRADE_FORCE) || (vdev->hw_fw_version != VXGE_DRV_FW_VERSION)) { /* For fw_ver 1.8.1 and above ignore build number. */ if ((fw_option == VXGE_FW_UPGRADE_ALL) && ((vdev->hw_fw_version >= VXGE_FW_VERSION(1, 8, 1)) && (fw_ver_maj_min == VXGE_DRV_FW_MAJ_MIN_VERSION))) { goto _exit1; } if (vdev->hw_fw_version < VXGE_BASE_FW_VERSION) { device_printf(vdev->ndev, "Upgrade driver through vxge_update, " "Unable to load the driver.\n"); goto _exit0; } vdev->fw_upgrade = TRUE; } _exit1: err = 0; _exit0: return (err); } /* * vxge_device_hw_info_print * Print device and driver information */ void vxge_device_hw_info_print(vxge_dev_t *vdev) { u32 i; device_t ndev; struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; char pmd_type[2][VXGE_PMD_INFO_LEN]; vxge_hal_device_t *hldev; vxge_hal_device_hw_info_t *hw_info; vxge_hal_device_pmd_info_t *pmd_port; hldev = vdev->devh; ndev = vdev->ndev; ctx = device_get_sysctl_ctx(ndev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(ndev)); hw_info = &(vdev->config.hw_info); snprintf(vdev->config.nic_attr[VXGE_PRINT_DRV_VERSION], sizeof(vdev->config.nic_attr[VXGE_PRINT_DRV_VERSION]), "%d.%d.%d.%d", XGELL_VERSION_MAJOR, XGELL_VERSION_MINOR, XGELL_VERSION_FIX, XGELL_VERSION_BUILD); /* Print PCI-e bus type/speed/width info */ snprintf(vdev->config.nic_attr[VXGE_PRINT_PCIE_INFO], sizeof(vdev->config.nic_attr[VXGE_PRINT_PCIE_INFO]), "x%d", hldev->link_width); if (hldev->link_width <= VXGE_HAL_PCI_E_LINK_WIDTH_X4) device_printf(ndev, "For optimal performance a x8 " "PCI-Express slot is required.\n"); vxge_null_terminate((char *) hw_info->serial_number, sizeof(hw_info->serial_number)); vxge_null_terminate((char *) hw_info->part_number, sizeof(hw_info->part_number)); snprintf(vdev->config.nic_attr[VXGE_PRINT_SERIAL_NO], sizeof(vdev->config.nic_attr[VXGE_PRINT_SERIAL_NO]), "%s", hw_info->serial_number); snprintf(vdev->config.nic_attr[VXGE_PRINT_PART_NO], sizeof(vdev->config.nic_attr[VXGE_PRINT_PART_NO]), "%s", hw_info->part_number); snprintf(vdev->config.nic_attr[VXGE_PRINT_FW_VERSION], sizeof(vdev->config.nic_attr[VXGE_PRINT_FW_VERSION]), "%s", hw_info->fw_version.version); snprintf(vdev->config.nic_attr[VXGE_PRINT_FW_DATE], sizeof(vdev->config.nic_attr[VXGE_PRINT_FW_DATE]), "%s", hw_info->fw_date.date); pmd_port = &(hw_info->pmd_port0); for (i = 0; i < hw_info->ports; i++) { vxge_pmd_port_type_get(vdev, pmd_port->type, pmd_type[i], sizeof(pmd_type[i])); strncpy(vdev->config.nic_attr[VXGE_PRINT_PMD_PORTS_0 + i], "vendor=??, sn=??, pn=??, type=??", sizeof(vdev->config.nic_attr[VXGE_PRINT_PMD_PORTS_0 + i])); vxge_null_terminate(pmd_port->vendor, sizeof(pmd_port->vendor)); if (strlen(pmd_port->vendor) == 0) { pmd_port = &(hw_info->pmd_port1); continue; } vxge_null_terminate(pmd_port->ser_num, sizeof(pmd_port->ser_num)); vxge_null_terminate(pmd_port->part_num, sizeof(pmd_port->part_num)); snprintf(vdev->config.nic_attr[VXGE_PRINT_PMD_PORTS_0 + i], sizeof(vdev->config.nic_attr[VXGE_PRINT_PMD_PORTS_0 + i]), "vendor=%s, sn=%s, pn=%s, type=%s", pmd_port->vendor, pmd_port->ser_num, pmd_port->part_num, pmd_type[i]); pmd_port = &(hw_info->pmd_port1); } switch (hw_info->function_mode) { case VXGE_HAL_PCIE_FUNC_MODE_SF1_VP17: snprintf(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE]), "%s %d %s", "Single Function - 1 function(s)", vdev->max_supported_vpath, "VPath(s)/function"); break; case VXGE_HAL_PCIE_FUNC_MODE_MF2_VP8: snprintf(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE]), "%s %d %s", "Multi Function - 2 function(s)", vdev->max_supported_vpath, "VPath(s)/function"); break; case VXGE_HAL_PCIE_FUNC_MODE_MF4_VP4: snprintf(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE]), "%s %d %s", "Multi Function - 4 function(s)", vdev->max_supported_vpath, "VPath(s)/function"); break; case VXGE_HAL_PCIE_FUNC_MODE_MF8_VP2: snprintf(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE]), "%s %d %s", "Multi Function - 8 function(s)", vdev->max_supported_vpath, "VPath(s)/function"); break; case VXGE_HAL_PCIE_FUNC_MODE_MF8P_VP2: snprintf(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE]), "%s %d %s", "Multi Function (DirectIO) - 8 function(s)", vdev->max_supported_vpath, "VPath(s)/function"); break; } snprintf(vdev->config.nic_attr[VXGE_PRINT_INTR_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_INTR_MODE]), "%s", ((vdev->config.intr_mode == VXGE_HAL_INTR_MODE_MSIX) ? "MSI-X" : "INTA")); snprintf(vdev->config.nic_attr[VXGE_PRINT_VPATH_COUNT], sizeof(vdev->config.nic_attr[VXGE_PRINT_VPATH_COUNT]), "%d", vdev->no_of_vpath); snprintf(vdev->config.nic_attr[VXGE_PRINT_MTU_SIZE], sizeof(vdev->config.nic_attr[VXGE_PRINT_MTU_SIZE]), "%u", vdev->ifp->if_mtu); snprintf(vdev->config.nic_attr[VXGE_PRINT_LRO_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_LRO_MODE]), "%s", ((vdev->config.lro_enable) ? "Enabled" : "Disabled")); snprintf(vdev->config.nic_attr[VXGE_PRINT_RTH_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_RTH_MODE]), "%s", ((vdev->config.rth_enable) ? "Enabled" : "Disabled")); snprintf(vdev->config.nic_attr[VXGE_PRINT_TSO_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_TSO_MODE]), "%s", ((vdev->ifp->if_capenable & IFCAP_TSO4) ? "Enabled" : "Disabled")); snprintf(vdev->config.nic_attr[VXGE_PRINT_ADAPTER_TYPE], sizeof(vdev->config.nic_attr[VXGE_PRINT_ADAPTER_TYPE]), "%s", ((hw_info->ports == 1) ? "Single Port" : "Dual Port")); if (vdev->is_privilaged) { if (hw_info->ports > 1) { snprintf(vdev->config.nic_attr[VXGE_PRINT_PORT_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_PORT_MODE]), "%s", vxge_port_mode[vdev->port_mode]); if (vdev->port_mode != VXGE_HAL_DP_NP_MODE_SINGLE_PORT) snprintf(vdev->config.nic_attr[VXGE_PRINT_PORT_FAILURE], sizeof(vdev->config.nic_attr[VXGE_PRINT_PORT_FAILURE]), "%s", vxge_port_failure[vdev->port_failure]); vxge_active_port_update(vdev); snprintf(vdev->config.nic_attr[VXGE_PRINT_ACTIVE_PORT], sizeof(vdev->config.nic_attr[VXGE_PRINT_ACTIVE_PORT]), "%lld", vdev->active_port); } if (!is_single_func(hw_info->function_mode)) { snprintf(vdev->config.nic_attr[VXGE_PRINT_L2SWITCH_MODE], sizeof(vdev->config.nic_attr[VXGE_PRINT_L2SWITCH_MODE]), "%s", ((vdev->l2_switch) ? "Enabled" : "Disabled")); } } device_printf(ndev, "Driver version\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_DRV_VERSION]); device_printf(ndev, "Serial number\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_SERIAL_NO]); device_printf(ndev, "Part number\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_PART_NO]); device_printf(ndev, "Firmware version\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_FW_VERSION]); device_printf(ndev, "Firmware date\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_FW_DATE]); device_printf(ndev, "Link width\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_PCIE_INFO]); if (vdev->is_privilaged) { device_printf(ndev, "Function mode\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE]); } device_printf(ndev, "Interrupt type\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_INTR_MODE]); device_printf(ndev, "VPath(s) opened\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_VPATH_COUNT]); device_printf(ndev, "Adapter Type\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_ADAPTER_TYPE]); device_printf(ndev, "PMD Port 0\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_PMD_PORTS_0]); if (hw_info->ports > 1) { device_printf(ndev, "PMD Port 1\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_PMD_PORTS_1]); if (vdev->is_privilaged) { device_printf(ndev, "Port Mode\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_PORT_MODE]); if (vdev->port_mode != VXGE_HAL_DP_NP_MODE_SINGLE_PORT) device_printf(ndev, "Port Failure\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_PORT_FAILURE]); device_printf(vdev->ndev, "Active Port\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_ACTIVE_PORT]); } } if (vdev->is_privilaged && !is_single_func(hw_info->function_mode)) { device_printf(vdev->ndev, "L2 Switch\t: %s\n", vdev->config.nic_attr[VXGE_PRINT_L2SWITCH_MODE]); } device_printf(ndev, "MTU is %s\n", vdev->config.nic_attr[VXGE_PRINT_MTU_SIZE]); device_printf(ndev, "LRO %s\n", vdev->config.nic_attr[VXGE_PRINT_LRO_MODE]); device_printf(ndev, "RTH %s\n", vdev->config.nic_attr[VXGE_PRINT_RTH_MODE]); device_printf(ndev, "TSO %s\n", vdev->config.nic_attr[VXGE_PRINT_TSO_MODE]); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Driver version", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_DRV_VERSION], 0, "Driver version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Serial number", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_SERIAL_NO], 0, "Serial number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Part number", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_PART_NO], 0, "Part number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Firmware version", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_FW_VERSION], 0, "Firmware version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Firmware date", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_FW_DATE], 0, "Firmware date"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Link width", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_PCIE_INFO], 0, "Link width"); if (vdev->is_privilaged) { SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Function mode", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_FUNC_MODE], 0, "Function mode"); } SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Interrupt type", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_INTR_MODE], 0, "Interrupt type"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "VPath(s) opened", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_VPATH_COUNT], 0, "VPath(s) opened"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Adapter Type", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_ADAPTER_TYPE], 0, "Adapter Type"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pmd port 0", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_PMD_PORTS_0], 0, "pmd port"); if (hw_info->ports > 1) { SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pmd port 1", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_PMD_PORTS_1], 0, "pmd port"); if (vdev->is_privilaged) { SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Port Mode", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_PORT_MODE], 0, "Port Mode"); if (vdev->port_mode != VXGE_HAL_DP_NP_MODE_SINGLE_PORT) SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Port Failure", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_PORT_FAILURE], 0, "Port Failure"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "L2 Switch", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_L2SWITCH_MODE], 0, "L2 Switch"); } } SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "LRO mode", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_LRO_MODE], 0, "LRO mode"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "RTH mode", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_RTH_MODE], 0, "RTH mode"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "TSO mode", CTLFLAG_RD, vdev->config.nic_attr[VXGE_PRINT_TSO_MODE], 0, "TSO mode"); } void vxge_pmd_port_type_get(vxge_dev_t *vdev, u32 port_type, char *ifm_name, u8 ifm_len) { vdev->ifm_optics = IFM_UNKNOWN; switch (port_type) { case VXGE_HAL_DEVICE_PMD_TYPE_10G_SR: vdev->ifm_optics = IFM_10G_SR; strlcpy(ifm_name, "10GbE SR", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_10G_LR: vdev->ifm_optics = IFM_10G_LR; strlcpy(ifm_name, "10GbE LR", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_10G_LRM: vdev->ifm_optics = IFM_10G_LRM; strlcpy(ifm_name, "10GbE LRM", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_10G_DIRECT: vdev->ifm_optics = IFM_10G_TWINAX; strlcpy(ifm_name, "10GbE DA (Direct Attached)", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_10G_CX4: vdev->ifm_optics = IFM_10G_CX4; strlcpy(ifm_name, "10GbE CX4", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_10G_BASE_T: #if __FreeBSD_version >= 800000 vdev->ifm_optics = IFM_10G_T; #endif strlcpy(ifm_name, "10GbE baseT", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_10G_OTHER: strlcpy(ifm_name, "10GbE Other", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_1G_SX: vdev->ifm_optics = IFM_1000_SX; strlcpy(ifm_name, "1GbE SX", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_1G_LX: vdev->ifm_optics = IFM_1000_LX; strlcpy(ifm_name, "1GbE LX", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_1G_CX: vdev->ifm_optics = IFM_1000_CX; strlcpy(ifm_name, "1GbE CX", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_1G_BASE_T: vdev->ifm_optics = IFM_1000_T; strlcpy(ifm_name, "1GbE baseT", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_1G_DIRECT: strlcpy(ifm_name, "1GbE DA (Direct Attached)", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_1G_CX4: strlcpy(ifm_name, "1GbE CX4", ifm_len); break; case VXGE_HAL_DEVICE_PMD_TYPE_1G_OTHER: strlcpy(ifm_name, "1GbE Other", ifm_len); break; default: case VXGE_HAL_DEVICE_PMD_TYPE_UNKNOWN: strlcpy(ifm_name, "UNSUP", ifm_len); break; } } u32 vxge_ring_length_get(u32 buffer_mode) { return (VXGE_DEFAULT_RING_BLOCK * vxge_hal_ring_rxds_per_block_get(buffer_mode)); } /* * Removes trailing spaces padded * and NULL terminates strings */ static inline void vxge_null_terminate(char *str, size_t len) { len--; while (*str && (*str != ' ') && (len != 0)) ++str; --len; if (*str) *str = '\0'; } /* * vxge_ioctl * Callback to control the device */ int vxge_ioctl(ifnet_t ifp, u_long command, caddr_t data) { int mask, err = 0; vxge_dev_t *vdev = (vxge_dev_t *) ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; if (!vdev->is_active) return (EBUSY); switch (command) { /* Set/Get ifnet address */ case SIOCSIFADDR: case SIOCGIFADDR: ether_ioctl(ifp, command, data); break; /* Set Interface MTU */ case SIOCSIFMTU: err = vxge_change_mtu(vdev, (unsigned long)ifr->ifr_mtu); break; /* Set Interface Flags */ case SIOCSIFFLAGS: VXGE_DRV_LOCK(vdev); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ vdev->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) vxge_promisc_set(vdev); } else { vxge_init_locked(vdev); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) vxge_stop_locked(vdev); } vdev->if_flags = ifp->if_flags; VXGE_DRV_UNLOCK(vdev); break; /* Add/delete multicast address */ case SIOCADDMULTI: case SIOCDELMULTI: break; /* Get/Set Interface Media */ case SIOCSIFMEDIA: case SIOCGIFMEDIA: err = ifmedia_ioctl(ifp, ifr, &vdev->media, command); break; /* Set Capabilities */ case SIOCSIFCAP: VXGE_DRV_LOCK(vdev); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if ((ifp->if_capenable & IFCAP_TSO) && !(ifp->if_capenable & IFCAP_TXCSUM)) { ifp->if_capenable &= ~IFCAP_TSO; ifp->if_hwassist &= ~CSUM_TSO; if_printf(ifp, "TSO Disabled\n"); } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (ifp->if_capenable & IFCAP_TSO) { if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= CSUM_TSO; if_printf(ifp, "TSO Enabled\n"); } else { ifp->if_capenable &= ~IFCAP_TSO; ifp->if_hwassist &= ~CSUM_TSO; if_printf(ifp, "Enable tx checksum offload \ first.\n"); err = EAGAIN; } } else { ifp->if_hwassist &= ~CSUM_TSO; if_printf(ifp, "TSO Disabled\n"); } } if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_MTU) ifp->if_capenable ^= IFCAP_VLAN_MTU; if (mask & IFCAP_VLAN_HWCSUM) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; #if __FreeBSD_version >= 800000 if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; #endif #if defined(VLAN_CAPABILITIES) VLAN_CAPABILITIES(ifp); #endif VXGE_DRV_UNLOCK(vdev); break; case SIOCGPRIVATE_0: VXGE_DRV_LOCK(vdev); err = vxge_ioctl_stats(vdev, ifr); VXGE_DRV_UNLOCK(vdev); break; case SIOCGPRIVATE_1: VXGE_DRV_LOCK(vdev); err = vxge_ioctl_regs(vdev, ifr); VXGE_DRV_UNLOCK(vdev); break; default: err = ether_ioctl(ifp, command, data); break; } return (err); } /* * vxge_ioctl_regs * IOCTL to get registers */ int vxge_ioctl_regs(vxge_dev_t *vdev, struct ifreq *ifr) { u64 value = 0x0; u32 vp_id = 0; u32 offset, reqd_size = 0; int i, err = EINVAL; char *command = (char *) ifr->ifr_data; void *reg_info = (void *) ifr->ifr_data; vxge_vpath_t *vpath; vxge_hal_status_e status = VXGE_HAL_OK; vxge_hal_mgmt_reg_type_e regs_type; switch (*command) { case vxge_hal_mgmt_reg_type_pcicfgmgmt: if (vdev->is_privilaged) { reqd_size = sizeof(vxge_hal_pcicfgmgmt_reg_t); regs_type = vxge_hal_mgmt_reg_type_pcicfgmgmt; } break; case vxge_hal_mgmt_reg_type_mrpcim: if (vdev->is_privilaged) { reqd_size = sizeof(vxge_hal_mrpcim_reg_t); regs_type = vxge_hal_mgmt_reg_type_mrpcim; } break; case vxge_hal_mgmt_reg_type_srpcim: if (vdev->is_privilaged) { reqd_size = sizeof(vxge_hal_srpcim_reg_t); regs_type = vxge_hal_mgmt_reg_type_srpcim; } break; case vxge_hal_mgmt_reg_type_memrepair: if (vdev->is_privilaged) { /* reqd_size = sizeof(vxge_hal_memrepair_reg_t); */ regs_type = vxge_hal_mgmt_reg_type_memrepair; } break; case vxge_hal_mgmt_reg_type_legacy: reqd_size = sizeof(vxge_hal_legacy_reg_t); regs_type = vxge_hal_mgmt_reg_type_legacy; break; case vxge_hal_mgmt_reg_type_toc: reqd_size = sizeof(vxge_hal_toc_reg_t); regs_type = vxge_hal_mgmt_reg_type_toc; break; case vxge_hal_mgmt_reg_type_common: reqd_size = sizeof(vxge_hal_common_reg_t); regs_type = vxge_hal_mgmt_reg_type_common; break; case vxge_hal_mgmt_reg_type_vpmgmt: reqd_size = sizeof(vxge_hal_vpmgmt_reg_t); regs_type = vxge_hal_mgmt_reg_type_vpmgmt; vpath = &(vdev->vpaths[*((u32 *) reg_info + 1)]); vp_id = vpath->vp_id; break; case vxge_hal_mgmt_reg_type_vpath: reqd_size = sizeof(vxge_hal_vpath_reg_t); regs_type = vxge_hal_mgmt_reg_type_vpath; vpath = &(vdev->vpaths[*((u32 *) reg_info + 1)]); vp_id = vpath->vp_id; break; case VXGE_GET_VPATH_COUNT: *((u32 *) reg_info) = vdev->no_of_vpath; err = 0; break; default: reqd_size = 0; break; } if (reqd_size) { for (i = 0, offset = 0; offset < reqd_size; i++, offset += 0x0008) { value = 0x0; status = vxge_hal_mgmt_reg_read(vdev->devh, regs_type, vp_id, offset, &value); err = (status != VXGE_HAL_OK) ? EINVAL : 0; if (err == EINVAL) break; *((u64 *) ((u64 *) reg_info + i)) = value; } } return (err); } /* * vxge_ioctl_stats * IOCTL to get statistics */ int vxge_ioctl_stats(vxge_dev_t *vdev, struct ifreq *ifr) { int i, retsize, err = EINVAL; u32 bufsize; vxge_vpath_t *vpath; vxge_bw_info_t *bw_info; vxge_port_info_t *port_info; vxge_drv_stats_t *drv_stat; char *buffer = NULL; char *command = (char *) ifr->ifr_data; vxge_hal_status_e status = VXGE_HAL_OK; switch (*command) { case VXGE_GET_PCI_CONF: bufsize = VXGE_STATS_BUFFER_SIZE; buffer = (char *) vxge_mem_alloc(bufsize); if (buffer != NULL) { status = vxge_hal_aux_pci_config_read(vdev->devh, bufsize, buffer, &retsize); if (status == VXGE_HAL_OK) err = copyout(buffer, ifr->ifr_data, retsize); else device_printf(vdev->ndev, "failed pciconfig statistics query\n"); vxge_mem_free(buffer, bufsize); } break; case VXGE_GET_MRPCIM_STATS: if (!vdev->is_privilaged) break; bufsize = VXGE_STATS_BUFFER_SIZE; buffer = (char *) vxge_mem_alloc(bufsize); if (buffer != NULL) { status = vxge_hal_aux_stats_mrpcim_read(vdev->devh, bufsize, buffer, &retsize); if (status == VXGE_HAL_OK) err = copyout(buffer, ifr->ifr_data, retsize); else device_printf(vdev->ndev, "failed mrpcim statistics query\n"); vxge_mem_free(buffer, bufsize); } break; case VXGE_GET_DEVICE_STATS: bufsize = VXGE_STATS_BUFFER_SIZE; buffer = (char *) vxge_mem_alloc(bufsize); if (buffer != NULL) { status = vxge_hal_aux_stats_device_read(vdev->devh, bufsize, buffer, &retsize); if (status == VXGE_HAL_OK) err = copyout(buffer, ifr->ifr_data, retsize); else device_printf(vdev->ndev, "failed device statistics query\n"); vxge_mem_free(buffer, bufsize); } break; case VXGE_GET_DEVICE_HWINFO: bufsize = sizeof(vxge_device_hw_info_t); buffer = (char *) vxge_mem_alloc(bufsize); if (buffer != NULL) { vxge_os_memcpy( &(((vxge_device_hw_info_t *) buffer)->hw_info), &vdev->config.hw_info, sizeof(vxge_hal_device_hw_info_t)); ((vxge_device_hw_info_t *) buffer)->port_mode = vdev->port_mode; ((vxge_device_hw_info_t *) buffer)->port_failure = vdev->port_failure; err = copyout(buffer, ifr->ifr_data, bufsize); if (err != 0) device_printf(vdev->ndev, "failed device hardware info query\n"); vxge_mem_free(buffer, bufsize); } break; case VXGE_GET_DRIVER_STATS: bufsize = sizeof(vxge_drv_stats_t) * vdev->no_of_vpath; drv_stat = (vxge_drv_stats_t *) vxge_mem_alloc(bufsize); if (drv_stat != NULL) { for (i = 0; i < vdev->no_of_vpath; i++) { vpath = &(vdev->vpaths[i]); vpath->driver_stats.rx_lro_queued += vpath->lro.lro_queued; vpath->driver_stats.rx_lro_flushed += vpath->lro.lro_flushed; vxge_os_memcpy(&drv_stat[i], &(vpath->driver_stats), sizeof(vxge_drv_stats_t)); } err = copyout(drv_stat, ifr->ifr_data, bufsize); if (err != 0) device_printf(vdev->ndev, "failed driver statistics query\n"); vxge_mem_free(drv_stat, bufsize); } break; case VXGE_GET_BANDWIDTH: bw_info = (vxge_bw_info_t *) ifr->ifr_data; if ((vdev->config.hw_info.func_id != 0) && (vdev->hw_fw_version < VXGE_FW_VERSION(1, 8, 0))) break; if (vdev->config.hw_info.func_id != 0) bw_info->func_id = vdev->config.hw_info.func_id; status = vxge_bw_priority_get(vdev, bw_info); if (status != VXGE_HAL_OK) break; err = copyout(bw_info, ifr->ifr_data, sizeof(vxge_bw_info_t)); break; case VXGE_SET_BANDWIDTH: if (vdev->is_privilaged) err = vxge_bw_priority_set(vdev, ifr); break; case VXGE_SET_PORT_MODE: if (vdev->is_privilaged) { if (vdev->config.hw_info.ports == VXGE_DUAL_PORT_MODE) { port_info = (vxge_port_info_t *) ifr->ifr_data; vdev->config.port_mode = port_info->port_mode; err = vxge_port_mode_update(vdev); if (err != ENXIO) err = VXGE_HAL_FAIL; else { err = VXGE_HAL_OK; device_printf(vdev->ndev, "PLEASE POWER CYCLE THE SYSTEM\n"); } } } break; case VXGE_GET_PORT_MODE: if (vdev->is_privilaged) { if (vdev->config.hw_info.ports == VXGE_DUAL_PORT_MODE) { port_info = (vxge_port_info_t *) ifr->ifr_data; err = vxge_port_mode_get(vdev, port_info); if (err == VXGE_HAL_OK) { err = copyout(port_info, ifr->ifr_data, sizeof(vxge_port_info_t)); } } } break; default: break; } return (err); } int vxge_bw_priority_config(vxge_dev_t *vdev) { u32 i; int err = EINVAL; for (i = 0; i < vdev->no_of_func; i++) { err = vxge_bw_priority_update(vdev, i, TRUE); if (err != 0) break; } return (err); } int vxge_bw_priority_set(vxge_dev_t *vdev, struct ifreq *ifr) { int err; u32 func_id; vxge_bw_info_t *bw_info; bw_info = (vxge_bw_info_t *) ifr->ifr_data; func_id = bw_info->func_id; vdev->config.bw_info[func_id].priority = bw_info->priority; vdev->config.bw_info[func_id].bandwidth = bw_info->bandwidth; err = vxge_bw_priority_update(vdev, func_id, FALSE); return (err); } int vxge_bw_priority_update(vxge_dev_t *vdev, u32 func_id, bool binit) { u32 i, set = 0; u32 bandwidth, priority, vpath_count; u64 vpath_list[VXGE_HAL_MAX_VIRTUAL_PATHS]; vxge_hal_device_t *hldev; vxge_hal_vp_config_t *vp_config; vxge_hal_status_e status = VXGE_HAL_OK; hldev = vdev->devh; status = vxge_hal_get_vpath_list(vdev->devh, func_id, vpath_list, &vpath_count); if (status != VXGE_HAL_OK) return (status); for (i = 0; i < vpath_count; i++) { vp_config = &(hldev->config.vp_config[vpath_list[i]]); /* Configure Bandwidth */ if (vdev->config.bw_info[func_id].bandwidth != VXGE_HAL_VPATH_BW_LIMIT_DEFAULT) { set = 1; bandwidth = vdev->config.bw_info[func_id].bandwidth; if (bandwidth < VXGE_HAL_VPATH_BW_LIMIT_MIN || bandwidth > VXGE_HAL_VPATH_BW_LIMIT_MAX) { bandwidth = VXGE_HAL_VPATH_BW_LIMIT_DEFAULT; } vp_config->bandwidth = bandwidth; } /* * If b/w limiting is enabled on any of the * VFs, then for remaining VFs set the priority to 3 * and b/w limiting to max i.e 10 Gb) */ if (vp_config->bandwidth == VXGE_HAL_VPATH_BW_LIMIT_DEFAULT) vp_config->bandwidth = VXGE_HAL_VPATH_BW_LIMIT_MAX; if (binit && vdev->config.low_latency) { if (func_id == 0) vdev->config.bw_info[func_id].priority = VXGE_DEFAULT_VPATH_PRIORITY_HIGH; } /* Configure Priority */ if (vdev->config.bw_info[func_id].priority != VXGE_HAL_VPATH_PRIORITY_DEFAULT) { set = 1; priority = vdev->config.bw_info[func_id].priority; if (priority < VXGE_HAL_VPATH_PRIORITY_MIN || priority > VXGE_HAL_VPATH_PRIORITY_MAX) { priority = VXGE_HAL_VPATH_PRIORITY_DEFAULT; } vp_config->priority = priority; } else if (vdev->config.low_latency) { set = 1; vp_config->priority = VXGE_DEFAULT_VPATH_PRIORITY_LOW; } if (set == 1) { status = vxge_hal_rx_bw_priority_set(vdev->devh, vpath_list[i]); if (status != VXGE_HAL_OK) break; if (vpath_list[i] < VXGE_HAL_TX_BW_VPATH_LIMIT) { status = vxge_hal_tx_bw_priority_set( vdev->devh, vpath_list[i]); if (status != VXGE_HAL_OK) break; } } } return ((status == VXGE_HAL_OK) ? 0 : EINVAL); } /* * vxge_intr_coalesce_tx * Changes interrupt coalescing if the interrupts are not within a range * Return Value: Nothing */ void vxge_intr_coalesce_tx(vxge_vpath_t *vpath) { u32 timer; if (!vpath->tx_intr_coalesce) return; vpath->tx_interrupts++; if (ticks > vpath->tx_ticks + hz/100) { vpath->tx_ticks = ticks; timer = vpath->tti_rtimer_val; if (vpath->tx_interrupts > VXGE_MAX_TX_INTERRUPT_COUNT) { if (timer != VXGE_TTI_RTIMER_ADAPT_VAL) { vpath->tti_rtimer_val = VXGE_TTI_RTIMER_ADAPT_VAL; vxge_hal_vpath_dynamic_tti_rtimer_set( vpath->handle, vpath->tti_rtimer_val); } } else { if (timer != 0) { vpath->tti_rtimer_val = 0; vxge_hal_vpath_dynamic_tti_rtimer_set( vpath->handle, vpath->tti_rtimer_val); } } vpath->tx_interrupts = 0; } } /* * vxge_intr_coalesce_rx * Changes interrupt coalescing if the interrupts are not within a range * Return Value: Nothing */ void vxge_intr_coalesce_rx(vxge_vpath_t *vpath) { u32 timer; if (!vpath->rx_intr_coalesce) return; vpath->rx_interrupts++; if (ticks > vpath->rx_ticks + hz/100) { vpath->rx_ticks = ticks; timer = vpath->rti_rtimer_val; if (vpath->rx_interrupts > VXGE_MAX_RX_INTERRUPT_COUNT) { if (timer != VXGE_RTI_RTIMER_ADAPT_VAL) { vpath->rti_rtimer_val = VXGE_RTI_RTIMER_ADAPT_VAL; vxge_hal_vpath_dynamic_rti_rtimer_set( vpath->handle, vpath->rti_rtimer_val); } } else { if (timer != 0) { vpath->rti_rtimer_val = 0; vxge_hal_vpath_dynamic_rti_rtimer_set( vpath->handle, vpath->rti_rtimer_val); } } vpath->rx_interrupts = 0; } } /* * vxge_methods FreeBSD device interface entry points */ static device_method_t vxge_methods[] = { DEVMETHOD(device_probe, vxge_probe), DEVMETHOD(device_attach, vxge_attach), DEVMETHOD(device_detach, vxge_detach), DEVMETHOD(device_shutdown, vxge_shutdown), DEVMETHOD_END }; static driver_t vxge_driver = { "vxge", vxge_methods, sizeof(vxge_dev_t), }; static devclass_t vxge_devclass; DRIVER_MODULE(vxge, pci, vxge_driver, vxge_devclass, 0, 0); Index: head/sys/net/flowtable.c =================================================================== --- head/sys/net/flowtable.c (revision 275357) +++ head/sys/net/flowtable.c (revision 275358) @@ -1,1189 +1,1189 @@ /*- * Copyright (c) 2014 Gleb Smirnoff * Copyright (c) 2008-2010, BitGravity Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Neither the name of the BitGravity Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "opt_route.h" #include "opt_mpath.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef FLOWTABLE_HASH_ALL #include #include #include #endif #include #ifdef FLOWTABLE_HASH_ALL #define KEY_PORTS (sizeof(uint16_t) * 2) #define KEY_ADDRS 2 #else #define KEY_PORTS 0 #define KEY_ADDRS 1 #endif #ifdef INET6 #define KEY_ADDR_LEN sizeof(struct in6_addr) #else #define KEY_ADDR_LEN sizeof(struct in_addr) #endif #define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t)) struct flentry { uint32_t f_hash; /* hash flowing forward */ uint32_t f_key[KEYLEN]; /* address(es and ports) */ uint32_t f_uptime; /* uptime at last access */ uint16_t f_fibnum; /* fib index */ #ifdef FLOWTABLE_HASH_ALL uint8_t f_proto; /* protocol */ uint8_t f_flags; /* stale? */ #define FL_STALE 1 #endif SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */ struct rtentry *f_rt; /* rtentry for flow */ struct llentry *f_lle; /* llentry for flow */ }; #undef KEYLEN SLIST_HEAD(flist, flentry); /* Make sure we can use pcpu_zone_ptr for struct flist. */ CTASSERT(sizeof(struct flist) == sizeof(void *)); struct flowtable { counter_u64_t *ft_stat; int ft_size; /* * ft_table is a malloc(9)ed array of pointers. Pointers point to * memory from UMA_ZONE_PCPU zone. * ft_masks is per-cpu pointer itself. Each instance points * to a malloc(9)ed bitset, that is private to corresponding CPU. */ struct flist **ft_table; bitstr_t **ft_masks; bitstr_t *ft_tmpmask; }; #define FLOWSTAT_ADD(ft, name, v) \ counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v)) #define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1) static struct proc *flowcleanerproc; static uint32_t flow_hashjitter; static struct cv flowclean_f_cv; static struct cv flowclean_c_cv; static struct mtx flowclean_lock; static uint32_t flowclean_cycles; /* * TODO: * - add sysctls to resize && flush flow tables * - Add per flowtable sysctls for statistics and configuring timeouts * - add saturation counter to rtentry to support per-packet load-balancing * add flag to indicate round-robin flow, add list lookup from head for flows * - add sysctl / device node / syscall to support exporting and importing * of flows with flag to indicate that a flow was imported so should * not be considered for auto-cleaning * - support explicit connection state (currently only ad-hoc for DSR) * - idetach() cleanup for options VIMAGE builds. */ #ifdef INET static VNET_DEFINE(struct flowtable, ip4_ft); #define V_ip4_ft VNET(ip4_ft) #endif #ifdef INET6 static VNET_DEFINE(struct flowtable, ip6_ft); #define V_ip6_ft VNET(ip6_ft) #endif static uma_zone_t flow_zone; static VNET_DEFINE(int, flowtable_enable) = 1; #define V_flowtable_enable VNET(flowtable_enable) static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable"); SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW, &flow_zone, "Maximum number of flows allowed"); static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings"); static struct flentry * flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t); #ifdef INET static struct flentry * flowtable_lookup_ipv4(struct mbuf *m, struct route *ro) { struct flentry *fle; struct sockaddr_in *sin; struct ip *ip; uint32_t fibnum; #ifdef FLOWTABLE_HASH_ALL uint32_t key[3]; int iphlen; uint16_t sport, dport; uint8_t proto; #endif ip = mtod(m, struct ip *); if (ip->ip_src.s_addr == ip->ip_dst.s_addr || (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) return (NULL); fibnum = M_GETFIB(m); #ifdef FLOWTABLE_HASH_ALL iphlen = ip->ip_hl << 2; proto = ip->ip_p; switch (proto) { case IPPROTO_TCP: { struct tcphdr *th; th = (struct tcphdr *)((char *)ip + iphlen); sport = th->th_sport; dport = th->th_dport; if (th->th_flags & (TH_RST|TH_FIN)) fibnum |= (FL_STALE << 24); break; } case IPPROTO_UDP: { struct udphdr *uh; uh = (struct udphdr *)((char *)ip + iphlen); sport = uh->uh_sport; dport = uh->uh_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh; sh = (struct sctphdr *)((char *)ip + iphlen); sport = sh->src_port; dport = sh->dest_port; /* XXXGL: handle stale? */ break; } default: sport = dport = 0; break; } key[0] = ip->ip_dst.s_addr; key[1] = ip->ip_src.s_addr; key[2] = (dport << 16) | sport; fibnum |= proto << 16; fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t), fibnum); #else /* !FLOWTABLE_HASH_ALL */ fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst, sizeof(struct in_addr), fibnum); #endif /* FLOWTABLE_HASH_ALL */ if (fle == NULL) return (NULL); sin = (struct sockaddr_in *)&ro->ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; return (fle); } #endif /* INET */ #ifdef INET6 /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) \ do { \ int x = (_len) + sizeof(T); \ if ((m)->m_len < x) \ return (NULL); \ p = (mtod(m, char *) + (_len)); \ } while (0) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) static struct flentry * flowtable_lookup_ipv6(struct mbuf *m, struct route *ro) { struct flentry *fle; struct sockaddr_in6 *sin6; struct ip6_hdr *ip6; uint32_t fibnum; #ifdef FLOWTABLE_HASH_ALL uint32_t key[9]; void *ulp; int hlen; uint16_t sport, dport; u_short offset; uint8_t proto; #else uint32_t key[4]; #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_localaddr(&ip6->ip6_dst)) return (NULL); fibnum = M_GETFIB(m); #ifdef FLOWTABLE_HASH_ALL hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; offset = sport = dport = 0; ulp = NULL; while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: case IPPROTO_OSPFIGP: case IPPROTO_PIM: case IPPROTO_CARP: case IPPROTO_ESP: case IPPROTO_NONE: ulp = ip6; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dport = TCP(ulp)->th_dport; sport = TCP(ulp)->th_sport; if (TCP(ulp)->th_flags & (TH_RST|TH_FIN)) fibnum |= (FL_STALE << 24); break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); dport = SCTP(ulp)->src_port; sport = SCTP(ulp)->dest_port; /* XXXGL: handle stale? */ break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dport = UDP(ulp)->uh_dport; sport = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; default: PULLUP_TO(hlen, ulp, struct ip6_ext); break; } } bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr)); key[8] = (dport << 16) | sport; fibnum |= proto << 16; fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t), fibnum); #else /* !FLOWTABLE_HASH_ALL */ bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr), fibnum); #endif /* FLOWTABLE_HASH_ALL */ if (fle == NULL) return (NULL); sin6 = (struct sockaddr_in6 *)&ro->ro_dst; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr)); return (fle); } #endif /* INET6 */ static bitstr_t * flowtable_mask(struct flowtable *ft) { /* * flowtable_free_stale() calls w/o critical section, but * with sched_bind(). Since pointer is stable throughout * ft lifetime, it is safe, otherwise... * * CRITICAL_ASSERT(curthread); */ return (*(bitstr_t **)zpcpu_get(ft->ft_masks)); } static struct flist * flowtable_list(struct flowtable *ft, uint32_t hash) { CRITICAL_ASSERT(curthread); return (zpcpu_get(ft->ft_table[hash % ft->ft_size])); } static int flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle) { if (((fle->f_rt->rt_flags & RTF_HOST) && ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) || (fle->f_rt->rt_ifp == NULL) || !RT_LINK_IS_UP(fle->f_rt->rt_ifp) || (fle->f_lle->la_flags & LLE_VALID) == 0) return (1); if (time_uptime - fle->f_uptime > maxidle) return (1); #ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) return (1); #endif return (0); } static int flow_full(void) { int count, max; count = uma_zone_get_cur(flow_zone); max = uma_zone_get_max(flow_zone); return (count > (max - (max >> 3))); } static int flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum) { #ifdef FLOWTABLE_HASH_ALL uint8_t proto; proto = (fibnum >> 16) & 0xff; fibnum &= 0xffff; #endif CRITICAL_ASSERT(curthread); /* Microoptimization for IPv4: don't use bcmp(). */ if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) || (bcmp(fle->f_key, key, keylen) == 0)) && fibnum == fle->f_fibnum && #ifdef FLOWTABLE_HASH_ALL proto == fle->f_proto && #endif (fle->f_rt->rt_flags & RTF_UP) && fle->f_rt->rt_ifp != NULL && (fle->f_lle->la_flags & LLE_VALID)) return (1); return (0); } static struct flentry * flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, int keylen, uint32_t fibnum0) { #ifdef INET6 struct route_in6 sro6; #endif #ifdef INET struct route sro; #endif struct route *ro = NULL; struct rtentry *rt; struct lltable *lt = NULL; struct llentry *lle; struct sockaddr_storage *l3addr; struct ifnet *ifp; struct flist *flist; struct flentry *fle, *iter; bitstr_t *mask; uint16_t fibnum = fibnum0; #ifdef FLOWTABLE_HASH_ALL uint8_t proto; proto = (fibnum0 >> 16) & 0xff; fibnum = fibnum0 & 0xffff; #endif /* * This bit of code ends up locking the * same route 3 times (just like ip_output + ether_output) * - at lookup * - in rt_check when called by arpresolve * - dropping the refcount for the rtentry * * This could be consolidated to one if we wrote a variant * of arpresolve with an rt_check variant that expected to * receive the route locked */ #ifdef INET if (ft == &V_ip4_ft) { struct sockaddr_in *sin; ro = &sro; bzero(&sro.ro_dst, sizeof(sro.ro_dst)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr.s_addr = key[0]; } #endif #ifdef INET6 if (ft == &V_ip6_ft) { struct sockaddr_in6 *sin6; ro = (struct route *)&sro6; sin6 = &sro6.ro_dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr)); } #endif ro->ro_rt = NULL; #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, hash, fibnum); #else rtalloc_ign_fib(ro, 0, fibnum); #endif if (ro->ro_rt == NULL) return (NULL); rt = ro->ro_rt; ifp = rt->rt_ifp; if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { RTFREE(rt); return (NULL); } #ifdef INET if (ft == &V_ip4_ft) lt = LLTABLE(ifp); #endif #ifdef INET6 if (ft == &V_ip6_ft) lt = LLTABLE6(ifp); #endif if (rt->rt_flags & RTF_GATEWAY) l3addr = (struct sockaddr_storage *)rt->rt_gateway; else l3addr = (struct sockaddr_storage *)&ro->ro_dst; lle = llentry_alloc(ifp, lt, l3addr); if (lle == NULL) { RTFREE(rt); return (NULL); } /* Don't insert the entry if the ARP hasn't yet finished resolving. */ if ((lle->la_flags & LLE_VALID) == 0) { RTFREE(rt); LLE_FREE(lle); FLOWSTAT_INC(ft, ft_fail_lle_invalid); return (NULL); } fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO); if (fle == NULL) { RTFREE(rt); LLE_FREE(lle); return (NULL); } fle->f_hash = hash; bcopy(key, &fle->f_key, keylen); fle->f_rt = rt; fle->f_lle = lle; fle->f_fibnum = fibnum; fle->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL fle->f_proto = proto; fle->f_flags = fibnum0 >> 24; #endif critical_enter(); mask = flowtable_mask(ft); flist = flowtable_list(ft, hash); if (SLIST_EMPTY(flist)) { bit_set(mask, (hash % ft->ft_size)); SLIST_INSERT_HEAD(flist, fle, f_next); goto skip; } /* * find end of list and make sure that we were not * preempted by another thread handling this flow */ SLIST_FOREACH(iter, flist, f_next) { KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size, ("%s: wrong hash", __func__)); if (flow_matches(iter, key, keylen, fibnum)) { /* * We probably migrated to an other CPU after * lookup in flowtable_lookup_common() failed. * It appeared that this CPU already has flow * entry. */ iter->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL iter->f_flags |= fibnum >> 24; #endif critical_exit(); FLOWSTAT_INC(ft, ft_collisions); uma_zfree(flow_zone, fle); return (iter); } } SLIST_INSERT_HEAD(flist, fle, f_next); skip: critical_exit(); FLOWSTAT_INC(ft, ft_inserts); return (fle); } int flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro) { struct flentry *fle; if (V_flowtable_enable == 0) return (ENXIO); switch (sa) { #ifdef INET case AF_INET: fle = flowtable_lookup_ipv4(m, ro); break; #endif #ifdef INET6 case AF_INET6: fle = flowtable_lookup_ipv6(m, ro); break; #endif default: panic("%s: sa %d", __func__, sa); } if (fle == NULL) return (EHOSTUNREACH); - if (!(m->m_flags & M_FLOWID)) { - m->m_flags |= M_FLOWID; + if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) { + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = fle->f_hash; } ro->ro_rt = fle->f_rt; ro->ro_lle = fle->f_lle; ro->ro_flags |= RT_NORTREF; return (0); } static struct flentry * flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen, uint32_t fibnum) { struct flist *flist; struct flentry *fle; uint32_t hash; FLOWSTAT_INC(ft, ft_lookups); hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter); critical_enter(); flist = flowtable_list(ft, hash); SLIST_FOREACH(fle, flist, f_next) { KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size, ("%s: wrong hash", __func__)); if (flow_matches(fle, key, keylen, fibnum)) { fle->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL fle->f_flags |= fibnum >> 24; #endif critical_exit(); FLOWSTAT_INC(ft, ft_hits); return (fle); } } critical_exit(); FLOWSTAT_INC(ft, ft_misses); return (flowtable_insert(ft, hash, key, keylen, fibnum)); } /* * used by the bit_alloc macro */ #define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO) static void flowtable_alloc(struct flowtable *ft) { ft->ft_table = malloc(ft->ft_size * sizeof(struct flist), M_FTABLE, M_WAITOK); for (int i = 0; i < ft->ft_size; i++) ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO); ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK); for (int i = 0; i < mp_ncpus; i++) { bitstr_t **b; b = zpcpu_get_cpu(ft->ft_masks, i); *b = bit_alloc(ft->ft_size); } ft->ft_tmpmask = bit_alloc(ft->ft_size); } #undef calloc static void flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle) { struct flist *flist, freelist; struct flentry *fle, *fle1, *fleprev; bitstr_t *mask, *tmpmask; int curbit, tmpsize; SLIST_INIT(&freelist); mask = flowtable_mask(ft); tmpmask = ft->ft_tmpmask; tmpsize = ft->ft_size; memcpy(tmpmask, mask, ft->ft_size/8); curbit = 0; fleprev = NULL; /* pacify gcc */ /* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { if (curbit >= ft->ft_size || curbit < -1) { log(LOG_ALERT, "warning: bad curbit value %d \n", curbit); break; } FLOWSTAT_INC(ft, ft_free_checks); critical_enter(); flist = flowtable_list(ft, curbit); #ifdef DIAGNOSTIC if (SLIST_EMPTY(flist) && curbit > 0) { log(LOG_ALERT, "warning bit=%d set, but no fle found\n", curbit); } #endif SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) { if (rt != NULL && fle->f_rt != rt) { fleprev = fle; continue; } if (!flow_stale(ft, fle, maxidle)) { fleprev = fle; continue; } if (fle == SLIST_FIRST(flist)) SLIST_REMOVE_HEAD(flist, f_next); else SLIST_REMOVE_AFTER(fleprev, f_next); SLIST_INSERT_HEAD(&freelist, fle, f_next); } if (SLIST_EMPTY(flist)) bit_clear(mask, curbit); critical_exit(); bit_clear(tmpmask, curbit); tmpmask += (curbit / 8); tmpsize -= (curbit / 8) * 8; bit_ffs(tmpmask, tmpsize, &curbit); } SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) { FLOWSTAT_INC(ft, ft_frees); if (fle->f_rt != NULL) RTFREE(fle->f_rt); if (fle->f_lle != NULL) LLE_FREE(fle->f_lle); uma_zfree(flow_zone, fle); } } static void flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle) { int i; CPU_FOREACH(i) { if (smp_started == 1) { thread_lock(curthread); sched_bind(curthread, i); thread_unlock(curthread); } flowtable_free_stale(ft, rt, maxidle); if (smp_started == 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } } } void flowtable_route_flush(sa_family_t sa, struct rtentry *rt) { struct flowtable *ft; switch (sa) { #ifdef INET case AF_INET: ft = &V_ip4_ft; break; #endif #ifdef INET6 case AF_INET6: ft = &V_ip6_ft; break; #endif default: panic("%s: sa %d", __func__, sa); } flowtable_clean_vnet(ft, rt, 0); } static void flowtable_cleaner(void) { VNET_ITERATOR_DECL(vnet_iter); struct thread *td; if (bootverbose) log(LOG_INFO, "flowtable cleaner started\n"); td = curthread; while (1) { uint32_t flowclean_freq, maxidle; /* * The maximum idle time, as well as frequency are arbitrary. */ if (flow_full()) maxidle = 5; else maxidle = 30; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef INET flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle); #endif #ifdef INET6 flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle); #endif CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); if (flow_full()) flowclean_freq = 4*hz; else flowclean_freq = 20*hz; mtx_lock(&flowclean_lock); thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); flowclean_cycles++; cv_broadcast(&flowclean_f_cv); cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq); mtx_unlock(&flowclean_lock); } } static void flowtable_flush(void *unused __unused) { uint64_t start; mtx_lock(&flowclean_lock); start = flowclean_cycles; while (start == flowclean_cycles) { cv_broadcast(&flowclean_c_cv); cv_wait(&flowclean_f_cv, &flowclean_lock); } mtx_unlock(&flowclean_lock); } static struct kproc_desc flow_kp = { "flowcleaner", flowtable_cleaner, &flowcleanerproc }; SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp); static int flowtable_get_size(char *name) { int size; if (TUNABLE_INT_FETCH(name, &size)) { if (size < 256) size = 256; if (!powerof2(size)) { printf("%s must be power of 2\n", name); size = 2048; } } else { /* * round up to the next power of 2 */ size = 1 << fls((1024 + maxusers * 64) - 1); } return (size); } static void flowtable_init(const void *unused __unused) { flow_hashjitter = arc4random(); flow_zone = uma_zcreate("flows", sizeof(struct flentry), NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET); uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus); cv_init(&flowclean_c_cv, "c_flowcleanwait"); cv_init(&flowclean_f_cv, "f_flowcleanwait"); mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, flowtable_init, NULL); #ifdef INET static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL, "Flowtable for IPv4"); static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat); VNET_PCPUSTAT_SYSINIT(ip4_ftstat); VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat); SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat, ip4_ftstat, "Flowtable statistics for IPv4 " "(struct flowtable_stat, net/flowtable.h)"); static void flowtable_init_vnet_v4(const void *unused __unused) { V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size"); V_ip4_ft.ft_stat = VNET(ip4_ftstat); flowtable_alloc(&V_ip4_ft); } VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, flowtable_init_vnet_v4, NULL); #endif /* INET */ #ifdef INET6 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL, "Flowtable for IPv6"); static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat); VNET_PCPUSTAT_SYSINIT(ip6_ftstat); VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat); SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat, ip6_ftstat, "Flowtable statistics for IPv6 " "(struct flowtable_stat, net/flowtable.h)"); static void flowtable_init_vnet_v6(const void *unused __unused) { V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size"); V_ip6_ft.ft_stat = VNET(ip6_ftstat); flowtable_alloc(&V_ip6_ft); } VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, flowtable_init_vnet_v6, NULL); #endif /* INET6 */ #ifdef DDB static bitstr_t * flowtable_mask_pcpu(struct flowtable *ft, int cpuid) { return (zpcpu_get_cpu(*ft->ft_masks, cpuid)); } static struct flist * flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) { return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid)); } static void flow_show(struct flowtable *ft, struct flentry *fle) { int idle_time; int rt_valid, ifp_valid; volatile struct rtentry *rt; struct ifnet *ifp = NULL; uint32_t *hashkey = fle->f_key; idle_time = (int)(time_uptime - fle->f_uptime); rt = fle->f_rt; rt_valid = rt != NULL; if (rt_valid) ifp = rt->rt_ifp; ifp_valid = ifp != NULL; #ifdef INET if (ft == &V_ip4_ft) { char daddr[4*sizeof "123"]; #ifdef FLOWTABLE_HASH_ALL char saddr[4*sizeof "123"]; uint16_t sport, dport; #endif inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr); #ifdef FLOWTABLE_HASH_ALL inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); dport = ntohs((uint16_t)(hashkey[2] >> 16)); sport = ntohs((uint16_t)(hashkey[2] & 0xffff)); db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport); #else db_printf("%s ", daddr); #endif } #endif /* INET */ #ifdef INET6 if (ft == &V_ip6_ft) { #ifdef FLOWTABLE_HASH_ALL db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", hashkey[0], hashkey[1], hashkey[2], hashkey[3], hashkey[4], hashkey[5], hashkey[6], hashkey[7], hashkey[8]); #else db_printf("\n\tkey=%08x:%08x:%08x ", hashkey[0], hashkey[1], hashkey[2]); #endif } #endif /* INET6 */ db_printf("hash=%08x idle_time=%03d" "\n\tfibnum=%02d rt=%p", fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt); #ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) db_printf(" FL_STALE "); #endif if (rt_valid) { if (rt->rt_flags & RTF_UP) db_printf(" RTF_UP "); } if (ifp_valid) { if (ifp->if_flags & IFF_LOOPBACK) db_printf(" IFF_LOOPBACK "); if (ifp->if_flags & IFF_UP) db_printf(" IFF_UP "); if (ifp->if_flags & IFF_POINTOPOINT) db_printf(" IFF_POINTOPOINT "); } db_printf("\n"); } static void flowtable_show(struct flowtable *ft, int cpuid) { int curbit = 0; bitstr_t *mask, *tmpmask; if (cpuid != -1) db_printf("cpu: %d\n", cpuid); mask = flowtable_mask_pcpu(ft, cpuid); tmpmask = ft->ft_tmpmask; memcpy(tmpmask, mask, ft->ft_size/8); /* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { struct flist *flist; struct flentry *fle; if (curbit >= ft->ft_size || curbit < -1) { db_printf("warning: bad curbit value %d \n", curbit); break; } flist = flowtable_list_pcpu(ft, curbit, cpuid); SLIST_FOREACH(fle, flist, f_next) flow_show(ft, fle); bit_clear(tmpmask, curbit); bit_ffs(tmpmask, ft->ft_size, &curbit); } } static void flowtable_show_vnet(struct flowtable *ft) { int i; CPU_FOREACH(i) flowtable_show(ft, i); } DB_SHOW_COMMAND(flowtables, db_show_flowtables) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef VIMAGE db_printf("vnet %p\n", vnet_iter); #endif #ifdef INET printf("IPv4:\n"); flowtable_show_vnet(&V_ip4_ft); #endif #ifdef INET6 printf("IPv6:\n"); flowtable_show_vnet(&V_ip6_ft); #endif CURVNET_RESTORE(); } } #endif Index: head/sys/net/ieee8023ad_lacp.c =================================================================== --- head/sys/net/ieee8023ad_lacp.c (revision 275357) +++ head/sys/net/ieee8023ad_lacp.c (revision 275358) @@ -1,2021 +1,2022 @@ /* $NetBSD: ieee8023ad_lacp.c,v 1.3 2005/12/11 12:24:54 christos Exp $ */ /*- * Copyright (c)2005 YAMAMOTO Takashi, * Copyright (c)2008 Andrew Thompson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include /* hz */ #include /* for net/if.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * actor system priority and port priority. * XXX should be configurable. */ #define LACP_SYSTEM_PRIO 0x8000 #define LACP_PORT_PRIO 0x8000 const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 }; static const struct tlv_template lacp_info_tlv_template[] = { { LACP_TYPE_ACTORINFO, sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, { LACP_TYPE_PARTNERINFO, sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, { LACP_TYPE_COLLECTORINFO, sizeof(struct tlvhdr) + sizeof(struct lacp_collectorinfo) }, { 0, 0 }, }; static const struct tlv_template marker_info_tlv_template[] = { { MARKER_TYPE_INFO, sizeof(struct tlvhdr) + sizeof(struct lacp_markerinfo) }, { 0, 0 }, }; static const struct tlv_template marker_response_tlv_template[] = { { MARKER_TYPE_RESPONSE, sizeof(struct tlvhdr) + sizeof(struct lacp_markerinfo) }, { 0, 0 }, }; typedef void (*lacp_timer_func_t)(struct lacp_port *); static void lacp_fill_actorinfo(struct lacp_port *, struct lacp_peerinfo *); static void lacp_fill_markerinfo(struct lacp_port *, struct lacp_markerinfo *); static uint64_t lacp_aggregator_bandwidth(struct lacp_aggregator *); static void lacp_suppress_distributing(struct lacp_softc *, struct lacp_aggregator *); static void lacp_transit_expire(void *); static void lacp_update_portmap(struct lacp_softc *); static void lacp_select_active_aggregator(struct lacp_softc *); static uint16_t lacp_compose_key(struct lacp_port *); static int tlv_check(const void *, size_t, const struct tlvhdr *, const struct tlv_template *, boolean_t); static void lacp_tick(void *); static void lacp_fill_aggregator_id(struct lacp_aggregator *, const struct lacp_port *); static void lacp_fill_aggregator_id_peer(struct lacp_peerinfo *, const struct lacp_peerinfo *); static int lacp_aggregator_is_compatible(const struct lacp_aggregator *, const struct lacp_port *); static int lacp_peerinfo_is_compatible(const struct lacp_peerinfo *, const struct lacp_peerinfo *); static struct lacp_aggregator *lacp_aggregator_get(struct lacp_softc *, struct lacp_port *); static void lacp_aggregator_addref(struct lacp_softc *, struct lacp_aggregator *); static void lacp_aggregator_delref(struct lacp_softc *, struct lacp_aggregator *); /* receive machine */ static int lacp_pdu_input(struct lacp_port *, struct mbuf *); static int lacp_marker_input(struct lacp_port *, struct mbuf *); static void lacp_sm_rx(struct lacp_port *, const struct lacpdu *); static void lacp_sm_rx_timer(struct lacp_port *); static void lacp_sm_rx_set_expired(struct lacp_port *); static void lacp_sm_rx_update_ntt(struct lacp_port *, const struct lacpdu *); static void lacp_sm_rx_record_pdu(struct lacp_port *, const struct lacpdu *); static void lacp_sm_rx_update_selected(struct lacp_port *, const struct lacpdu *); static void lacp_sm_rx_record_default(struct lacp_port *); static void lacp_sm_rx_update_default_selected(struct lacp_port *); static void lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *, const struct lacp_peerinfo *); /* mux machine */ static void lacp_sm_mux(struct lacp_port *); static void lacp_set_mux(struct lacp_port *, enum lacp_mux_state); static void lacp_sm_mux_timer(struct lacp_port *); /* periodic transmit machine */ static void lacp_sm_ptx_update_timeout(struct lacp_port *, uint8_t); static void lacp_sm_ptx_tx_schedule(struct lacp_port *); static void lacp_sm_ptx_timer(struct lacp_port *); /* transmit machine */ static void lacp_sm_tx(struct lacp_port *); static void lacp_sm_assert_ntt(struct lacp_port *); static void lacp_run_timers(struct lacp_port *); static int lacp_compare_peerinfo(const struct lacp_peerinfo *, const struct lacp_peerinfo *); static int lacp_compare_systemid(const struct lacp_systemid *, const struct lacp_systemid *); static void lacp_port_enable(struct lacp_port *); static void lacp_port_disable(struct lacp_port *); static void lacp_select(struct lacp_port *); static void lacp_unselect(struct lacp_port *); static void lacp_disable_collecting(struct lacp_port *); static void lacp_enable_collecting(struct lacp_port *); static void lacp_disable_distributing(struct lacp_port *); static void lacp_enable_distributing(struct lacp_port *); static int lacp_xmit_lacpdu(struct lacp_port *); static int lacp_xmit_marker(struct lacp_port *); /* Debugging */ static void lacp_dump_lacpdu(const struct lacpdu *); static const char *lacp_format_partner(const struct lacp_peerinfo *, char *, size_t); static const char *lacp_format_lagid(const struct lacp_peerinfo *, const struct lacp_peerinfo *, char *, size_t); static const char *lacp_format_lagid_aggregator(const struct lacp_aggregator *, char *, size_t); static const char *lacp_format_state(uint8_t, char *, size_t); static const char *lacp_format_mac(const uint8_t *, char *, size_t); static const char *lacp_format_systemid(const struct lacp_systemid *, char *, size_t); static const char *lacp_format_portid(const struct lacp_portid *, char *, size_t); static void lacp_dprintf(const struct lacp_port *, const char *, ...) __attribute__((__format__(__printf__, 2, 3))); static VNET_DEFINE(int, lacp_debug); #define V_lacp_debug VNET(lacp_debug) SYSCTL_NODE(_net_link_lagg, OID_AUTO, lacp, CTLFLAG_RD, 0, "ieee802.3ad"); SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, debug, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(lacp_debug), 0, "Enable LACP debug logging (1=debug, 2=trace)"); #define LACP_DPRINTF(a) if (V_lacp_debug & 0x01) { lacp_dprintf a ; } #define LACP_TRACE(a) if (V_lacp_debug & 0x02) { lacp_dprintf(a,"%s\n",__func__); } #define LACP_TPRINTF(a) if (V_lacp_debug & 0x04) { lacp_dprintf a ; } /* * partner administration variables. * XXX should be configurable. */ static const struct lacp_peerinfo lacp_partner_admin_optimistic = { .lip_systemid = { .lsi_prio = 0xffff }, .lip_portid = { .lpi_prio = 0xffff }, .lip_state = LACP_STATE_SYNC | LACP_STATE_AGGREGATION | LACP_STATE_COLLECTING | LACP_STATE_DISTRIBUTING, }; static const struct lacp_peerinfo lacp_partner_admin_strict = { .lip_systemid = { .lsi_prio = 0xffff }, .lip_portid = { .lpi_prio = 0xffff }, .lip_state = 0, }; static const lacp_timer_func_t lacp_timer_funcs[LACP_NTIMER] = { [LACP_TIMER_CURRENT_WHILE] = lacp_sm_rx_timer, [LACP_TIMER_PERIODIC] = lacp_sm_ptx_timer, [LACP_TIMER_WAIT_WHILE] = lacp_sm_mux_timer, }; struct mbuf * lacp_input(struct lagg_port *lgp, struct mbuf *m) { struct lacp_port *lp = LACP_PORT(lgp); uint8_t subtype; if (m->m_pkthdr.len < sizeof(struct ether_header) + sizeof(subtype)) { m_freem(m); return (NULL); } m_copydata(m, sizeof(struct ether_header), sizeof(subtype), &subtype); switch (subtype) { case SLOWPROTOCOLS_SUBTYPE_LACP: lacp_pdu_input(lp, m); return (NULL); case SLOWPROTOCOLS_SUBTYPE_MARKER: lacp_marker_input(lp, m); return (NULL); } /* Not a subtype we are interested in */ return (m); } /* * lacp_pdu_input: process lacpdu */ static int lacp_pdu_input(struct lacp_port *lp, struct mbuf *m) { struct lacp_softc *lsc = lp->lp_lsc; struct lacpdu *du; int error = 0; if (m->m_pkthdr.len != sizeof(*du)) { goto bad; } if ((m->m_flags & M_MCAST) == 0) { goto bad; } if (m->m_len < sizeof(*du)) { m = m_pullup(m, sizeof(*du)); if (m == NULL) { return (ENOMEM); } } du = mtod(m, struct lacpdu *); if (memcmp(&du->ldu_eh.ether_dhost, ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) { goto bad; } /* * ignore the version for compatibility with * the future protocol revisions. */ #if 0 if (du->ldu_sph.sph_version != 1) { goto bad; } #endif /* * ignore tlv types for compatibility with * the future protocol revisions. */ if (tlv_check(du, sizeof(*du), &du->ldu_tlv_actor, lacp_info_tlv_template, FALSE)) { goto bad; } if (V_lacp_debug > 0) { lacp_dprintf(lp, "lacpdu receive\n"); lacp_dump_lacpdu(du); } if ((1 << lp->lp_ifp->if_dunit) & lp->lp_lsc->lsc_debug.lsc_rx_test) { LACP_TPRINTF((lp, "Dropping RX PDU\n")); goto bad; } LACP_LOCK(lsc); lacp_sm_rx(lp, du); LACP_UNLOCK(lsc); m_freem(m); return (error); bad: m_freem(m); return (EINVAL); } static void lacp_fill_actorinfo(struct lacp_port *lp, struct lacp_peerinfo *info) { struct lagg_port *lgp = lp->lp_lagg; struct lagg_softc *sc = lgp->lp_softc; info->lip_systemid.lsi_prio = htons(LACP_SYSTEM_PRIO); memcpy(&info->lip_systemid.lsi_mac, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); info->lip_portid.lpi_prio = htons(LACP_PORT_PRIO); info->lip_portid.lpi_portno = htons(lp->lp_ifp->if_index); info->lip_state = lp->lp_state; } static void lacp_fill_markerinfo(struct lacp_port *lp, struct lacp_markerinfo *info) { struct ifnet *ifp = lp->lp_ifp; /* Fill in the port index and system id (encoded as the MAC) */ info->mi_rq_port = htons(ifp->if_index); memcpy(&info->mi_rq_system, lp->lp_systemid.lsi_mac, ETHER_ADDR_LEN); info->mi_rq_xid = htonl(0); } static int lacp_xmit_lacpdu(struct lacp_port *lp) { struct lagg_port *lgp = lp->lp_lagg; struct mbuf *m; struct lacpdu *du; int error; LACP_LOCK_ASSERT(lp->lp_lsc); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { return (ENOMEM); } m->m_len = m->m_pkthdr.len = sizeof(*du); du = mtod(m, struct lacpdu *); memset(du, 0, sizeof(*du)); memcpy(&du->ldu_eh.ether_dhost, ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN); memcpy(&du->ldu_eh.ether_shost, lgp->lp_lladdr, ETHER_ADDR_LEN); du->ldu_eh.ether_type = htons(ETHERTYPE_SLOW); du->ldu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_LACP; du->ldu_sph.sph_version = 1; TLV_SET(&du->ldu_tlv_actor, LACP_TYPE_ACTORINFO, sizeof(du->ldu_actor)); du->ldu_actor = lp->lp_actor; TLV_SET(&du->ldu_tlv_partner, LACP_TYPE_PARTNERINFO, sizeof(du->ldu_partner)); du->ldu_partner = lp->lp_partner; TLV_SET(&du->ldu_tlv_collector, LACP_TYPE_COLLECTORINFO, sizeof(du->ldu_collector)); du->ldu_collector.lci_maxdelay = 0; if (V_lacp_debug > 0) { lacp_dprintf(lp, "lacpdu transmit\n"); lacp_dump_lacpdu(du); } m->m_flags |= M_MCAST; /* * XXX should use higher priority queue. * otherwise network congestion can break aggregation. */ error = lagg_enqueue(lp->lp_ifp, m); return (error); } static int lacp_xmit_marker(struct lacp_port *lp) { struct lagg_port *lgp = lp->lp_lagg; struct mbuf *m; struct markerdu *mdu; int error; LACP_LOCK_ASSERT(lp->lp_lsc); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { return (ENOMEM); } m->m_len = m->m_pkthdr.len = sizeof(*mdu); mdu = mtod(m, struct markerdu *); memset(mdu, 0, sizeof(*mdu)); memcpy(&mdu->mdu_eh.ether_dhost, ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN); memcpy(&mdu->mdu_eh.ether_shost, lgp->lp_lladdr, ETHER_ADDR_LEN); mdu->mdu_eh.ether_type = htons(ETHERTYPE_SLOW); mdu->mdu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_MARKER; mdu->mdu_sph.sph_version = 1; /* Bump the transaction id and copy over the marker info */ lp->lp_marker.mi_rq_xid = htonl(ntohl(lp->lp_marker.mi_rq_xid) + 1); TLV_SET(&mdu->mdu_tlv, MARKER_TYPE_INFO, sizeof(mdu->mdu_info)); mdu->mdu_info = lp->lp_marker; LACP_DPRINTF((lp, "marker transmit, port=%u, sys=%6D, id=%u\n", ntohs(mdu->mdu_info.mi_rq_port), mdu->mdu_info.mi_rq_system, ":", ntohl(mdu->mdu_info.mi_rq_xid))); m->m_flags |= M_MCAST; error = lagg_enqueue(lp->lp_ifp, m); return (error); } void lacp_linkstate(struct lagg_port *lgp) { struct lacp_port *lp = LACP_PORT(lgp); struct lacp_softc *lsc = lp->lp_lsc; struct ifnet *ifp = lgp->lp_ifp; struct ifmediareq ifmr; int error = 0; u_int media; uint8_t old_state; uint16_t old_key; bzero((char *)&ifmr, sizeof(ifmr)); error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr); if (error != 0) return; LACP_LOCK(lsc); media = ifmr.ifm_active; LACP_DPRINTF((lp, "media changed 0x%x -> 0x%x, ether = %d, fdx = %d, " "link = %d\n", lp->lp_media, media, IFM_TYPE(media) == IFM_ETHER, (media & IFM_FDX) != 0, ifp->if_link_state == LINK_STATE_UP)); old_state = lp->lp_state; old_key = lp->lp_key; lp->lp_media = media; /* * If the port is not an active full duplex Ethernet link then it can * not be aggregated. */ if (IFM_TYPE(media) != IFM_ETHER || (media & IFM_FDX) == 0 || ifp->if_link_state != LINK_STATE_UP) { lacp_port_disable(lp); } else { lacp_port_enable(lp); } lp->lp_key = lacp_compose_key(lp); if (old_state != lp->lp_state || old_key != lp->lp_key) { LACP_DPRINTF((lp, "-> UNSELECTED\n")); lp->lp_selected = LACP_UNSELECTED; } LACP_UNLOCK(lsc); } static void lacp_tick(void *arg) { struct lacp_softc *lsc = arg; struct lacp_port *lp; LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) { if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) continue; CURVNET_SET(lp->lp_ifp->if_vnet); lacp_run_timers(lp); lacp_select(lp); lacp_sm_mux(lp); lacp_sm_tx(lp); lacp_sm_ptx_tx_schedule(lp); CURVNET_RESTORE(); } callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); } int lacp_port_create(struct lagg_port *lgp) { struct lagg_softc *sc = lgp->lp_softc; struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_port *lp; struct ifnet *ifp = lgp->lp_ifp; struct sockaddr_dl sdl; struct ifmultiaddr *rifma = NULL; int error; boolean_t active = TRUE; /* XXX should be configurable */ boolean_t fast = FALSE; /* XXX should be configurable */ link_init_sdl(ifp, (struct sockaddr *)&sdl, IFT_ETHER); sdl.sdl_alen = ETHER_ADDR_LEN; bcopy(ðermulticastaddr_slowprotocols, LLADDR(&sdl), ETHER_ADDR_LEN); error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma); if (error) { printf("%s: ADDMULTI failed on %s\n", __func__, lgp->lp_ifp->if_xname); return (error); } lp = malloc(sizeof(struct lacp_port), M_DEVBUF, M_NOWAIT|M_ZERO); if (lp == NULL) return (ENOMEM); LACP_LOCK(lsc); lgp->lp_psc = lp; lp->lp_ifp = ifp; lp->lp_lagg = lgp; lp->lp_lsc = lsc; lp->lp_ifma = rifma; LIST_INSERT_HEAD(&lsc->lsc_ports, lp, lp_next); lacp_fill_actorinfo(lp, &lp->lp_actor); lacp_fill_markerinfo(lp, &lp->lp_marker); lp->lp_state = (active ? LACP_STATE_ACTIVITY : 0) | (fast ? LACP_STATE_TIMEOUT : 0); lp->lp_aggregator = NULL; lacp_sm_rx_set_expired(lp); LACP_UNLOCK(lsc); lacp_linkstate(lgp); return (0); } void lacp_port_destroy(struct lagg_port *lgp) { struct lacp_port *lp = LACP_PORT(lgp); struct lacp_softc *lsc = lp->lp_lsc; int i; LACP_LOCK(lsc); for (i = 0; i < LACP_NTIMER; i++) { LACP_TIMER_DISARM(lp, i); } lacp_disable_collecting(lp); lacp_disable_distributing(lp); lacp_unselect(lp); LIST_REMOVE(lp, lp_next); LACP_UNLOCK(lsc); /* The address may have already been removed by if_purgemaddrs() */ if (!lgp->lp_detaching) if_delmulti_ifma(lp->lp_ifma); free(lp, M_DEVBUF); } void lacp_req(struct lagg_softc *sc, void *data) { struct lacp_opreq *req = (struct lacp_opreq *)data; struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_aggregator *la; bzero(req, sizeof(struct lacp_opreq)); /* * If the LACP softc is NULL, return with the opreq structure full of * zeros. It is normal for the softc to be NULL while the lagg is * being destroyed. */ if (NULL == lsc) return; la = lsc->lsc_active_aggregator; LACP_LOCK(lsc); if (la != NULL) { req->actor_prio = ntohs(la->la_actor.lip_systemid.lsi_prio); memcpy(&req->actor_mac, &la->la_actor.lip_systemid.lsi_mac, ETHER_ADDR_LEN); req->actor_key = ntohs(la->la_actor.lip_key); req->actor_portprio = ntohs(la->la_actor.lip_portid.lpi_prio); req->actor_portno = ntohs(la->la_actor.lip_portid.lpi_portno); req->actor_state = la->la_actor.lip_state; req->partner_prio = ntohs(la->la_partner.lip_systemid.lsi_prio); memcpy(&req->partner_mac, &la->la_partner.lip_systemid.lsi_mac, ETHER_ADDR_LEN); req->partner_key = ntohs(la->la_partner.lip_key); req->partner_portprio = ntohs(la->la_partner.lip_portid.lpi_prio); req->partner_portno = ntohs(la->la_partner.lip_portid.lpi_portno); req->partner_state = la->la_partner.lip_state; } LACP_UNLOCK(lsc); } void lacp_portreq(struct lagg_port *lgp, void *data) { struct lacp_opreq *req = (struct lacp_opreq *)data; struct lacp_port *lp = LACP_PORT(lgp); struct lacp_softc *lsc = lp->lp_lsc; LACP_LOCK(lsc); req->actor_prio = ntohs(lp->lp_actor.lip_systemid.lsi_prio); memcpy(&req->actor_mac, &lp->lp_actor.lip_systemid.lsi_mac, ETHER_ADDR_LEN); req->actor_key = ntohs(lp->lp_actor.lip_key); req->actor_portprio = ntohs(lp->lp_actor.lip_portid.lpi_prio); req->actor_portno = ntohs(lp->lp_actor.lip_portid.lpi_portno); req->actor_state = lp->lp_actor.lip_state; req->partner_prio = ntohs(lp->lp_partner.lip_systemid.lsi_prio); memcpy(&req->partner_mac, &lp->lp_partner.lip_systemid.lsi_mac, ETHER_ADDR_LEN); req->partner_key = ntohs(lp->lp_partner.lip_key); req->partner_portprio = ntohs(lp->lp_partner.lip_portid.lpi_prio); req->partner_portno = ntohs(lp->lp_partner.lip_portid.lpi_portno); req->partner_state = lp->lp_partner.lip_state; LACP_UNLOCK(lsc); } static void lacp_disable_collecting(struct lacp_port *lp) { LACP_DPRINTF((lp, "collecting disabled\n")); lp->lp_state &= ~LACP_STATE_COLLECTING; } static void lacp_enable_collecting(struct lacp_port *lp) { LACP_DPRINTF((lp, "collecting enabled\n")); lp->lp_state |= LACP_STATE_COLLECTING; } static void lacp_disable_distributing(struct lacp_port *lp) { struct lacp_aggregator *la = lp->lp_aggregator; struct lacp_softc *lsc = lp->lp_lsc; struct lagg_softc *sc = lsc->lsc_softc; char buf[LACP_LAGIDSTR_MAX+1]; LACP_LOCK_ASSERT(lsc); if (la == NULL || (lp->lp_state & LACP_STATE_DISTRIBUTING) == 0) { return; } KASSERT(!TAILQ_EMPTY(&la->la_ports), ("no aggregator ports")); KASSERT(la->la_nports > 0, ("nports invalid (%d)", la->la_nports)); KASSERT(la->la_refcnt >= la->la_nports, ("aggregator refcnt invalid")); LACP_DPRINTF((lp, "disable distributing on aggregator %s, " "nports %d -> %d\n", lacp_format_lagid_aggregator(la, buf, sizeof(buf)), la->la_nports, la->la_nports - 1)); TAILQ_REMOVE(&la->la_ports, lp, lp_dist_q); la->la_nports--; sc->sc_active = la->la_nports; if (lsc->lsc_active_aggregator == la) { lacp_suppress_distributing(lsc, la); lacp_select_active_aggregator(lsc); /* regenerate the port map, the active aggregator has changed */ lacp_update_portmap(lsc); } lp->lp_state &= ~LACP_STATE_DISTRIBUTING; } static void lacp_enable_distributing(struct lacp_port *lp) { struct lacp_aggregator *la = lp->lp_aggregator; struct lacp_softc *lsc = lp->lp_lsc; struct lagg_softc *sc = lsc->lsc_softc; char buf[LACP_LAGIDSTR_MAX+1]; LACP_LOCK_ASSERT(lsc); if ((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0) { return; } LACP_DPRINTF((lp, "enable distributing on aggregator %s, " "nports %d -> %d\n", lacp_format_lagid_aggregator(la, buf, sizeof(buf)), la->la_nports, la->la_nports + 1)); KASSERT(la->la_refcnt > la->la_nports, ("aggregator refcnt invalid")); TAILQ_INSERT_HEAD(&la->la_ports, lp, lp_dist_q); la->la_nports++; sc->sc_active = la->la_nports; lp->lp_state |= LACP_STATE_DISTRIBUTING; if (lsc->lsc_active_aggregator == la) { lacp_suppress_distributing(lsc, la); lacp_update_portmap(lsc); } else /* try to become the active aggregator */ lacp_select_active_aggregator(lsc); } static void lacp_transit_expire(void *vp) { struct lacp_softc *lsc = vp; LACP_LOCK_ASSERT(lsc); CURVNET_SET(lsc->lsc_softc->sc_ifp->if_vnet); LACP_TRACE(NULL); CURVNET_RESTORE(); lsc->lsc_suppress_distributing = FALSE; } void lacp_attach(struct lagg_softc *sc) { struct lacp_softc *lsc; lsc = malloc(sizeof(struct lacp_softc), M_DEVBUF, M_WAITOK | M_ZERO); sc->sc_psc = lsc; lsc->lsc_softc = sc; lsc->lsc_hashkey = arc4random(); lsc->lsc_active_aggregator = NULL; lsc->lsc_strict_mode = 1; LACP_LOCK_INIT(lsc); TAILQ_INIT(&lsc->lsc_aggregators); LIST_INIT(&lsc->lsc_ports); callout_init_mtx(&lsc->lsc_transit_callout, &lsc->lsc_mtx, 0); callout_init_mtx(&lsc->lsc_callout, &lsc->lsc_mtx, 0); /* if the lagg is already up then do the same */ if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) lacp_init(sc); } void lacp_detach(void *psc) { struct lacp_softc *lsc = (struct lacp_softc *)psc; KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators), ("aggregators still active")); KASSERT(lsc->lsc_active_aggregator == NULL, ("aggregator still attached")); callout_drain(&lsc->lsc_transit_callout); callout_drain(&lsc->lsc_callout); LACP_LOCK_DESTROY(lsc); free(lsc, M_DEVBUF); } void lacp_init(struct lagg_softc *sc) { struct lacp_softc *lsc = LACP_SOFTC(sc); LACP_LOCK(lsc); callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); LACP_UNLOCK(lsc); } void lacp_stop(struct lagg_softc *sc) { struct lacp_softc *lsc = LACP_SOFTC(sc); LACP_LOCK(lsc); callout_stop(&lsc->lsc_transit_callout); callout_stop(&lsc->lsc_callout); LACP_UNLOCK(lsc); } struct lagg_port * lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) { struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_portmap *pm; struct lacp_port *lp; uint32_t hash; if (__predict_false(lsc->lsc_suppress_distributing)) { LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); return (NULL); } pm = &lsc->lsc_pmap[lsc->lsc_activemap]; if (pm->pm_count == 0) { LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__)); return (NULL); } - if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && (m->m_flags & M_FLOWID)) + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && + M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) hash = m->m_pkthdr.flowid >> sc->flowid_shift; else hash = lagg_hashmbuf(sc, m, lsc->lsc_hashkey); hash %= pm->pm_count; lp = pm->pm_map[hash]; KASSERT((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0, ("aggregated port is not distributing")); return (lp->lp_lagg); } /* * lacp_suppress_distributing: drop transmit packets for a while * to preserve packet ordering. */ static void lacp_suppress_distributing(struct lacp_softc *lsc, struct lacp_aggregator *la) { struct lacp_port *lp; if (lsc->lsc_active_aggregator != la) { return; } LACP_TRACE(NULL); lsc->lsc_suppress_distributing = TRUE; /* send a marker frame down each port to verify the queues are empty */ LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) { lp->lp_flags |= LACP_PORT_MARK; lacp_xmit_marker(lp); } /* set a timeout for the marker frames */ callout_reset(&lsc->lsc_transit_callout, LACP_TRANSIT_DELAY * hz / 1000, lacp_transit_expire, lsc); } static int lacp_compare_peerinfo(const struct lacp_peerinfo *a, const struct lacp_peerinfo *b) { return (memcmp(a, b, offsetof(struct lacp_peerinfo, lip_state))); } static int lacp_compare_systemid(const struct lacp_systemid *a, const struct lacp_systemid *b) { return (memcmp(a, b, sizeof(*a))); } #if 0 /* unused */ static int lacp_compare_portid(const struct lacp_portid *a, const struct lacp_portid *b) { return (memcmp(a, b, sizeof(*a))); } #endif static uint64_t lacp_aggregator_bandwidth(struct lacp_aggregator *la) { struct lacp_port *lp; uint64_t speed; lp = TAILQ_FIRST(&la->la_ports); if (lp == NULL) { return (0); } speed = ifmedia_baudrate(lp->lp_media); speed *= la->la_nports; if (speed == 0) { LACP_DPRINTF((lp, "speed 0? media=0x%x nports=%d\n", lp->lp_media, la->la_nports)); } return (speed); } /* * lacp_select_active_aggregator: select an aggregator to be used to transmit * packets from lagg(4) interface. */ static void lacp_select_active_aggregator(struct lacp_softc *lsc) { struct lacp_aggregator *la; struct lacp_aggregator *best_la = NULL; uint64_t best_speed = 0; char buf[LACP_LAGIDSTR_MAX+1]; LACP_TRACE(NULL); TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) { uint64_t speed; if (la->la_nports == 0) { continue; } speed = lacp_aggregator_bandwidth(la); LACP_DPRINTF((NULL, "%s, speed=%jd, nports=%d\n", lacp_format_lagid_aggregator(la, buf, sizeof(buf)), speed, la->la_nports)); /* * This aggregator is chosen if the partner has a better * system priority or, the total aggregated speed is higher * or, it is already the chosen aggregator */ if ((best_la != NULL && LACP_SYS_PRI(la->la_partner) < LACP_SYS_PRI(best_la->la_partner)) || speed > best_speed || (speed == best_speed && la == lsc->lsc_active_aggregator)) { best_la = la; best_speed = speed; } } KASSERT(best_la == NULL || best_la->la_nports > 0, ("invalid aggregator refcnt")); KASSERT(best_la == NULL || !TAILQ_EMPTY(&best_la->la_ports), ("invalid aggregator list")); if (lsc->lsc_active_aggregator != best_la) { LACP_DPRINTF((NULL, "active aggregator changed\n")); LACP_DPRINTF((NULL, "old %s\n", lacp_format_lagid_aggregator(lsc->lsc_active_aggregator, buf, sizeof(buf)))); } else { LACP_DPRINTF((NULL, "active aggregator not changed\n")); } LACP_DPRINTF((NULL, "new %s\n", lacp_format_lagid_aggregator(best_la, buf, sizeof(buf)))); if (lsc->lsc_active_aggregator != best_la) { lsc->lsc_active_aggregator = best_la; lacp_update_portmap(lsc); if (best_la) { lacp_suppress_distributing(lsc, best_la); } } } /* * Updated the inactive portmap array with the new list of ports and * make it live. */ static void lacp_update_portmap(struct lacp_softc *lsc) { struct lagg_softc *sc = lsc->lsc_softc; struct lacp_aggregator *la; struct lacp_portmap *p; struct lacp_port *lp; uint64_t speed; u_int newmap; int i; newmap = lsc->lsc_activemap == 0 ? 1 : 0; p = &lsc->lsc_pmap[newmap]; la = lsc->lsc_active_aggregator; speed = 0; bzero(p, sizeof(struct lacp_portmap)); if (la != NULL && la->la_nports > 0) { p->pm_count = la->la_nports; i = 0; TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q) p->pm_map[i++] = lp; KASSERT(i == p->pm_count, ("Invalid port count")); speed = lacp_aggregator_bandwidth(la); } sc->sc_ifp->if_baudrate = speed; /* switch the active portmap over */ atomic_store_rel_int(&lsc->lsc_activemap, newmap); LACP_DPRINTF((NULL, "Set table %d with %d ports\n", lsc->lsc_activemap, lsc->lsc_pmap[lsc->lsc_activemap].pm_count)); } static uint16_t lacp_compose_key(struct lacp_port *lp) { struct lagg_port *lgp = lp->lp_lagg; struct lagg_softc *sc = lgp->lp_softc; u_int media = lp->lp_media; uint16_t key; if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) { /* * non-aggregatable links should have unique keys. * * XXX this isn't really unique as if_index is 16 bit. */ /* bit 0..14: (some bits of) if_index of this port */ key = lp->lp_ifp->if_index; /* bit 15: 1 */ key |= 0x8000; } else { u_int subtype = IFM_SUBTYPE(media); KASSERT(IFM_TYPE(media) == IFM_ETHER, ("invalid media type")); KASSERT((media & IFM_FDX) != 0, ("aggregating HDX interface")); /* bit 0..4: IFM_SUBTYPE modulo speed */ switch (subtype) { case IFM_10_T: case IFM_10_2: case IFM_10_5: case IFM_10_STP: case IFM_10_FL: key = IFM_10_T; break; case IFM_100_TX: case IFM_100_FX: case IFM_100_T4: case IFM_100_VG: case IFM_100_T2: key = IFM_100_TX; break; case IFM_1000_SX: case IFM_1000_LX: case IFM_1000_CX: case IFM_1000_T: key = IFM_1000_SX; break; case IFM_10G_LR: case IFM_10G_SR: case IFM_10G_CX4: case IFM_10G_TWINAX: case IFM_10G_TWINAX_LONG: case IFM_10G_LRM: case IFM_10G_T: key = IFM_10G_LR; break; case IFM_40G_CR4: case IFM_40G_SR4: case IFM_40G_LR4: key = IFM_40G_CR4; break; default: key = subtype; } /* bit 5..14: (some bits of) if_index of lagg device */ key |= 0x7fe0 & ((sc->sc_ifp->if_index) << 5); /* bit 15: 0 */ } return (htons(key)); } static void lacp_aggregator_addref(struct lacp_softc *lsc, struct lacp_aggregator *la) { char buf[LACP_LAGIDSTR_MAX+1]; LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n", __func__, lacp_format_lagid(&la->la_actor, &la->la_partner, buf, sizeof(buf)), la->la_refcnt, la->la_refcnt + 1)); KASSERT(la->la_refcnt > 0, ("refcount <= 0")); la->la_refcnt++; KASSERT(la->la_refcnt > la->la_nports, ("invalid refcount")); } static void lacp_aggregator_delref(struct lacp_softc *lsc, struct lacp_aggregator *la) { char buf[LACP_LAGIDSTR_MAX+1]; LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n", __func__, lacp_format_lagid(&la->la_actor, &la->la_partner, buf, sizeof(buf)), la->la_refcnt, la->la_refcnt - 1)); KASSERT(la->la_refcnt > la->la_nports, ("invalid refcnt")); la->la_refcnt--; if (la->la_refcnt > 0) { return; } KASSERT(la->la_refcnt == 0, ("refcount not zero")); KASSERT(lsc->lsc_active_aggregator != la, ("aggregator active")); TAILQ_REMOVE(&lsc->lsc_aggregators, la, la_q); free(la, M_DEVBUF); } /* * lacp_aggregator_get: allocate an aggregator. */ static struct lacp_aggregator * lacp_aggregator_get(struct lacp_softc *lsc, struct lacp_port *lp) { struct lacp_aggregator *la; la = malloc(sizeof(*la), M_DEVBUF, M_NOWAIT); if (la) { la->la_refcnt = 1; la->la_nports = 0; TAILQ_INIT(&la->la_ports); la->la_pending = 0; TAILQ_INSERT_TAIL(&lsc->lsc_aggregators, la, la_q); } return (la); } /* * lacp_fill_aggregator_id: setup a newly allocated aggregator from a port. */ static void lacp_fill_aggregator_id(struct lacp_aggregator *la, const struct lacp_port *lp) { lacp_fill_aggregator_id_peer(&la->la_partner, &lp->lp_partner); lacp_fill_aggregator_id_peer(&la->la_actor, &lp->lp_actor); la->la_actor.lip_state = lp->lp_state & LACP_STATE_AGGREGATION; } static void lacp_fill_aggregator_id_peer(struct lacp_peerinfo *lpi_aggr, const struct lacp_peerinfo *lpi_port) { memset(lpi_aggr, 0, sizeof(*lpi_aggr)); lpi_aggr->lip_systemid = lpi_port->lip_systemid; lpi_aggr->lip_key = lpi_port->lip_key; } /* * lacp_aggregator_is_compatible: check if a port can join to an aggregator. */ static int lacp_aggregator_is_compatible(const struct lacp_aggregator *la, const struct lacp_port *lp) { if (!(lp->lp_state & LACP_STATE_AGGREGATION) || !(lp->lp_partner.lip_state & LACP_STATE_AGGREGATION)) { return (0); } if (!(la->la_actor.lip_state & LACP_STATE_AGGREGATION)) { return (0); } if (!lacp_peerinfo_is_compatible(&la->la_partner, &lp->lp_partner)) { return (0); } if (!lacp_peerinfo_is_compatible(&la->la_actor, &lp->lp_actor)) { return (0); } return (1); } static int lacp_peerinfo_is_compatible(const struct lacp_peerinfo *a, const struct lacp_peerinfo *b) { if (memcmp(&a->lip_systemid, &b->lip_systemid, sizeof(a->lip_systemid))) { return (0); } if (memcmp(&a->lip_key, &b->lip_key, sizeof(a->lip_key))) { return (0); } return (1); } static void lacp_port_enable(struct lacp_port *lp) { lp->lp_state |= LACP_STATE_AGGREGATION; } static void lacp_port_disable(struct lacp_port *lp) { lacp_set_mux(lp, LACP_MUX_DETACHED); lp->lp_state &= ~LACP_STATE_AGGREGATION; lp->lp_selected = LACP_UNSELECTED; lacp_sm_rx_record_default(lp); lp->lp_partner.lip_state &= ~LACP_STATE_AGGREGATION; lp->lp_state &= ~LACP_STATE_EXPIRED; } /* * lacp_select: select an aggregator. create one if necessary. */ static void lacp_select(struct lacp_port *lp) { struct lacp_softc *lsc = lp->lp_lsc; struct lacp_aggregator *la; char buf[LACP_LAGIDSTR_MAX+1]; if (lp->lp_aggregator) { return; } KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), ("timer_wait_while still active")); LACP_DPRINTF((lp, "port lagid=%s\n", lacp_format_lagid(&lp->lp_actor, &lp->lp_partner, buf, sizeof(buf)))); TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) { if (lacp_aggregator_is_compatible(la, lp)) { break; } } if (la == NULL) { la = lacp_aggregator_get(lsc, lp); if (la == NULL) { LACP_DPRINTF((lp, "aggregator creation failed\n")); /* * will retry on the next tick. */ return; } lacp_fill_aggregator_id(la, lp); LACP_DPRINTF((lp, "aggregator created\n")); } else { LACP_DPRINTF((lp, "compatible aggregator found\n")); if (la->la_refcnt == LACP_MAX_PORTS) return; lacp_aggregator_addref(lsc, la); } LACP_DPRINTF((lp, "aggregator lagid=%s\n", lacp_format_lagid(&la->la_actor, &la->la_partner, buf, sizeof(buf)))); lp->lp_aggregator = la; lp->lp_selected = LACP_SELECTED; } /* * lacp_unselect: finish unselect/detach process. */ static void lacp_unselect(struct lacp_port *lp) { struct lacp_softc *lsc = lp->lp_lsc; struct lacp_aggregator *la = lp->lp_aggregator; KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), ("timer_wait_while still active")); if (la == NULL) { return; } lp->lp_aggregator = NULL; lacp_aggregator_delref(lsc, la); } /* mux machine */ static void lacp_sm_mux(struct lacp_port *lp) { struct lagg_port *lgp = lp->lp_lagg; struct lagg_softc *sc = lgp->lp_softc; enum lacp_mux_state new_state; boolean_t p_sync = (lp->lp_partner.lip_state & LACP_STATE_SYNC) != 0; boolean_t p_collecting = (lp->lp_partner.lip_state & LACP_STATE_COLLECTING) != 0; enum lacp_selected selected = lp->lp_selected; struct lacp_aggregator *la; if (V_lacp_debug > 1) lacp_dprintf(lp, "%s: state= 0x%x, selected= 0x%x, " "p_sync= 0x%x, p_collecting= 0x%x\n", __func__, lp->lp_mux_state, selected, p_sync, p_collecting); re_eval: la = lp->lp_aggregator; KASSERT(lp->lp_mux_state == LACP_MUX_DETACHED || la != NULL, ("MUX not detached")); new_state = lp->lp_mux_state; switch (lp->lp_mux_state) { case LACP_MUX_DETACHED: if (selected != LACP_UNSELECTED) { new_state = LACP_MUX_WAITING; } break; case LACP_MUX_WAITING: KASSERT(la->la_pending > 0 || !LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), ("timer_wait_while still active")); if (selected == LACP_SELECTED && la->la_pending == 0) { new_state = LACP_MUX_ATTACHED; } else if (selected == LACP_UNSELECTED) { new_state = LACP_MUX_DETACHED; } break; case LACP_MUX_ATTACHED: if (selected == LACP_SELECTED && p_sync) { new_state = LACP_MUX_COLLECTING; } else if (selected != LACP_SELECTED) { new_state = LACP_MUX_DETACHED; } break; case LACP_MUX_COLLECTING: if (selected == LACP_SELECTED && p_sync && p_collecting) { new_state = LACP_MUX_DISTRIBUTING; } else if (selected != LACP_SELECTED || !p_sync) { new_state = LACP_MUX_ATTACHED; } break; case LACP_MUX_DISTRIBUTING: if (selected != LACP_SELECTED || !p_sync || !p_collecting) { new_state = LACP_MUX_COLLECTING; lacp_dprintf(lp, "Interface stopped DISTRIBUTING, possible flapping\n"); sc->sc_flapping++; } break; default: panic("%s: unknown state", __func__); } if (lp->lp_mux_state == new_state) { return; } lacp_set_mux(lp, new_state); goto re_eval; } static void lacp_set_mux(struct lacp_port *lp, enum lacp_mux_state new_state) { struct lacp_aggregator *la = lp->lp_aggregator; if (lp->lp_mux_state == new_state) { return; } switch (new_state) { case LACP_MUX_DETACHED: lp->lp_state &= ~LACP_STATE_SYNC; lacp_disable_distributing(lp); lacp_disable_collecting(lp); lacp_sm_assert_ntt(lp); /* cancel timer */ if (LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE)) { KASSERT(la->la_pending > 0, ("timer_wait_while not active")); la->la_pending--; } LACP_TIMER_DISARM(lp, LACP_TIMER_WAIT_WHILE); lacp_unselect(lp); break; case LACP_MUX_WAITING: LACP_TIMER_ARM(lp, LACP_TIMER_WAIT_WHILE, LACP_AGGREGATE_WAIT_TIME); la->la_pending++; break; case LACP_MUX_ATTACHED: lp->lp_state |= LACP_STATE_SYNC; lacp_disable_collecting(lp); lacp_sm_assert_ntt(lp); break; case LACP_MUX_COLLECTING: lacp_enable_collecting(lp); lacp_disable_distributing(lp); lacp_sm_assert_ntt(lp); break; case LACP_MUX_DISTRIBUTING: lacp_enable_distributing(lp); break; default: panic("%s: unknown state", __func__); } LACP_DPRINTF((lp, "mux_state %d -> %d\n", lp->lp_mux_state, new_state)); lp->lp_mux_state = new_state; } static void lacp_sm_mux_timer(struct lacp_port *lp) { struct lacp_aggregator *la = lp->lp_aggregator; char buf[LACP_LAGIDSTR_MAX+1]; KASSERT(la->la_pending > 0, ("no pending event")); LACP_DPRINTF((lp, "%s: aggregator %s, pending %d -> %d\n", __func__, lacp_format_lagid(&la->la_actor, &la->la_partner, buf, sizeof(buf)), la->la_pending, la->la_pending - 1)); la->la_pending--; } /* periodic transmit machine */ static void lacp_sm_ptx_update_timeout(struct lacp_port *lp, uint8_t oldpstate) { if (LACP_STATE_EQ(oldpstate, lp->lp_partner.lip_state, LACP_STATE_TIMEOUT)) { return; } LACP_DPRINTF((lp, "partner timeout changed\n")); /* * FAST_PERIODIC -> SLOW_PERIODIC * or * SLOW_PERIODIC (-> PERIODIC_TX) -> FAST_PERIODIC * * let lacp_sm_ptx_tx_schedule to update timeout. */ LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC); /* * if timeout has been shortened, assert NTT. */ if ((lp->lp_partner.lip_state & LACP_STATE_TIMEOUT)) { lacp_sm_assert_ntt(lp); } } static void lacp_sm_ptx_tx_schedule(struct lacp_port *lp) { int timeout; if (!(lp->lp_state & LACP_STATE_ACTIVITY) && !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY)) { /* * NO_PERIODIC */ LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC); return; } if (LACP_TIMER_ISARMED(lp, LACP_TIMER_PERIODIC)) { return; } timeout = (lp->lp_partner.lip_state & LACP_STATE_TIMEOUT) ? LACP_FAST_PERIODIC_TIME : LACP_SLOW_PERIODIC_TIME; LACP_TIMER_ARM(lp, LACP_TIMER_PERIODIC, timeout); } static void lacp_sm_ptx_timer(struct lacp_port *lp) { lacp_sm_assert_ntt(lp); } static void lacp_sm_rx(struct lacp_port *lp, const struct lacpdu *du) { int timeout; /* * check LACP_DISABLED first */ if (!(lp->lp_state & LACP_STATE_AGGREGATION)) { return; } /* * check loopback condition. */ if (!lacp_compare_systemid(&du->ldu_actor.lip_systemid, &lp->lp_actor.lip_systemid)) { return; } /* * EXPIRED, DEFAULTED, CURRENT -> CURRENT */ lacp_sm_rx_update_selected(lp, du); lacp_sm_rx_update_ntt(lp, du); lacp_sm_rx_record_pdu(lp, du); timeout = (lp->lp_state & LACP_STATE_TIMEOUT) ? LACP_SHORT_TIMEOUT_TIME : LACP_LONG_TIMEOUT_TIME; LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, timeout); lp->lp_state &= ~LACP_STATE_EXPIRED; /* * kick transmit machine without waiting the next tick. */ lacp_sm_tx(lp); } static void lacp_sm_rx_set_expired(struct lacp_port *lp) { lp->lp_partner.lip_state &= ~LACP_STATE_SYNC; lp->lp_partner.lip_state |= LACP_STATE_TIMEOUT; LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, LACP_SHORT_TIMEOUT_TIME); lp->lp_state |= LACP_STATE_EXPIRED; } static void lacp_sm_rx_timer(struct lacp_port *lp) { if ((lp->lp_state & LACP_STATE_EXPIRED) == 0) { /* CURRENT -> EXPIRED */ LACP_DPRINTF((lp, "%s: CURRENT -> EXPIRED\n", __func__)); lacp_sm_rx_set_expired(lp); } else { /* EXPIRED -> DEFAULTED */ LACP_DPRINTF((lp, "%s: EXPIRED -> DEFAULTED\n", __func__)); lacp_sm_rx_update_default_selected(lp); lacp_sm_rx_record_default(lp); lp->lp_state &= ~LACP_STATE_EXPIRED; } } static void lacp_sm_rx_record_pdu(struct lacp_port *lp, const struct lacpdu *du) { boolean_t active; uint8_t oldpstate; char buf[LACP_STATESTR_MAX+1]; LACP_TRACE(lp); oldpstate = lp->lp_partner.lip_state; active = (du->ldu_actor.lip_state & LACP_STATE_ACTIVITY) || ((lp->lp_state & LACP_STATE_ACTIVITY) && (du->ldu_partner.lip_state & LACP_STATE_ACTIVITY)); lp->lp_partner = du->ldu_actor; if (active && ((LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state, LACP_STATE_AGGREGATION) && !lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner)) || (du->ldu_partner.lip_state & LACP_STATE_AGGREGATION) == 0)) { /* XXX nothing? */ } else { lp->lp_partner.lip_state &= ~LACP_STATE_SYNC; } lp->lp_state &= ~LACP_STATE_DEFAULTED; if (oldpstate != lp->lp_partner.lip_state) { LACP_DPRINTF((lp, "old pstate %s\n", lacp_format_state(oldpstate, buf, sizeof(buf)))); LACP_DPRINTF((lp, "new pstate %s\n", lacp_format_state(lp->lp_partner.lip_state, buf, sizeof(buf)))); } /* XXX Hack, still need to implement 5.4.9 para 2,3,4 */ if (lp->lp_lsc->lsc_strict_mode) lp->lp_partner.lip_state |= LACP_STATE_SYNC; lacp_sm_ptx_update_timeout(lp, oldpstate); } static void lacp_sm_rx_update_ntt(struct lacp_port *lp, const struct lacpdu *du) { LACP_TRACE(lp); if (lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner) || !LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state, LACP_STATE_ACTIVITY | LACP_STATE_SYNC | LACP_STATE_AGGREGATION)) { LACP_DPRINTF((lp, "%s: assert ntt\n", __func__)); lacp_sm_assert_ntt(lp); } } static void lacp_sm_rx_record_default(struct lacp_port *lp) { uint8_t oldpstate; LACP_TRACE(lp); oldpstate = lp->lp_partner.lip_state; if (lp->lp_lsc->lsc_strict_mode) lp->lp_partner = lacp_partner_admin_strict; else lp->lp_partner = lacp_partner_admin_optimistic;; lp->lp_state |= LACP_STATE_DEFAULTED; lacp_sm_ptx_update_timeout(lp, oldpstate); } static void lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *lp, const struct lacp_peerinfo *info) { LACP_TRACE(lp); if (lacp_compare_peerinfo(&lp->lp_partner, info) || !LACP_STATE_EQ(lp->lp_partner.lip_state, info->lip_state, LACP_STATE_AGGREGATION)) { lp->lp_selected = LACP_UNSELECTED; /* mux machine will clean up lp->lp_aggregator */ } } static void lacp_sm_rx_update_selected(struct lacp_port *lp, const struct lacpdu *du) { LACP_TRACE(lp); lacp_sm_rx_update_selected_from_peerinfo(lp, &du->ldu_actor); } static void lacp_sm_rx_update_default_selected(struct lacp_port *lp) { LACP_TRACE(lp); if (lp->lp_lsc->lsc_strict_mode) lacp_sm_rx_update_selected_from_peerinfo(lp, &lacp_partner_admin_strict); else lacp_sm_rx_update_selected_from_peerinfo(lp, &lacp_partner_admin_optimistic); } /* transmit machine */ static void lacp_sm_tx(struct lacp_port *lp) { int error = 0; if (!(lp->lp_state & LACP_STATE_AGGREGATION) #if 1 || (!(lp->lp_state & LACP_STATE_ACTIVITY) && !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY)) #endif ) { lp->lp_flags &= ~LACP_PORT_NTT; } if (!(lp->lp_flags & LACP_PORT_NTT)) { return; } /* Rate limit to 3 PDUs per LACP_FAST_PERIODIC_TIME */ if (ppsratecheck(&lp->lp_last_lacpdu, &lp->lp_lacpdu_sent, (3 / LACP_FAST_PERIODIC_TIME)) == 0) { LACP_DPRINTF((lp, "rate limited pdu\n")); return; } if (((1 << lp->lp_ifp->if_dunit) & lp->lp_lsc->lsc_debug.lsc_tx_test) == 0) { error = lacp_xmit_lacpdu(lp); } else { LACP_TPRINTF((lp, "Dropping TX PDU\n")); } if (error == 0) { lp->lp_flags &= ~LACP_PORT_NTT; } else { LACP_DPRINTF((lp, "lacpdu transmit failure, error %d\n", error)); } } static void lacp_sm_assert_ntt(struct lacp_port *lp) { lp->lp_flags |= LACP_PORT_NTT; } static void lacp_run_timers(struct lacp_port *lp) { int i; for (i = 0; i < LACP_NTIMER; i++) { KASSERT(lp->lp_timer[i] >= 0, ("invalid timer value %d", lp->lp_timer[i])); if (lp->lp_timer[i] == 0) { continue; } else if (--lp->lp_timer[i] <= 0) { if (lacp_timer_funcs[i]) { (*lacp_timer_funcs[i])(lp); } } } } int lacp_marker_input(struct lacp_port *lp, struct mbuf *m) { struct lacp_softc *lsc = lp->lp_lsc; struct lagg_port *lgp = lp->lp_lagg; struct lacp_port *lp2; struct markerdu *mdu; int error = 0; int pending = 0; if (m->m_pkthdr.len != sizeof(*mdu)) { goto bad; } if ((m->m_flags & M_MCAST) == 0) { goto bad; } if (m->m_len < sizeof(*mdu)) { m = m_pullup(m, sizeof(*mdu)); if (m == NULL) { return (ENOMEM); } } mdu = mtod(m, struct markerdu *); if (memcmp(&mdu->mdu_eh.ether_dhost, ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) { goto bad; } if (mdu->mdu_sph.sph_version != 1) { goto bad; } switch (mdu->mdu_tlv.tlv_type) { case MARKER_TYPE_INFO: if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv, marker_info_tlv_template, TRUE)) { goto bad; } mdu->mdu_tlv.tlv_type = MARKER_TYPE_RESPONSE; memcpy(&mdu->mdu_eh.ether_dhost, ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN); memcpy(&mdu->mdu_eh.ether_shost, lgp->lp_lladdr, ETHER_ADDR_LEN); error = lagg_enqueue(lp->lp_ifp, m); break; case MARKER_TYPE_RESPONSE: if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv, marker_response_tlv_template, TRUE)) { goto bad; } LACP_DPRINTF((lp, "marker response, port=%u, sys=%6D, id=%u\n", ntohs(mdu->mdu_info.mi_rq_port), mdu->mdu_info.mi_rq_system, ":", ntohl(mdu->mdu_info.mi_rq_xid))); /* Verify that it is the last marker we sent out */ if (memcmp(&mdu->mdu_info, &lp->lp_marker, sizeof(struct lacp_markerinfo))) goto bad; LACP_LOCK(lsc); lp->lp_flags &= ~LACP_PORT_MARK; if (lsc->lsc_suppress_distributing) { /* Check if any ports are waiting for a response */ LIST_FOREACH(lp2, &lsc->lsc_ports, lp_next) { if (lp2->lp_flags & LACP_PORT_MARK) { pending = 1; break; } } if (pending == 0) { /* All interface queues are clear */ LACP_DPRINTF((NULL, "queue flush complete\n")); lsc->lsc_suppress_distributing = FALSE; } } LACP_UNLOCK(lsc); m_freem(m); break; default: goto bad; } return (error); bad: LACP_DPRINTF((lp, "bad marker frame\n")); m_freem(m); return (EINVAL); } static int tlv_check(const void *p, size_t size, const struct tlvhdr *tlv, const struct tlv_template *tmpl, boolean_t check_type) { while (/* CONSTCOND */ 1) { if ((const char *)tlv - (const char *)p + sizeof(*tlv) > size) { return (EINVAL); } if ((check_type && tlv->tlv_type != tmpl->tmpl_type) || tlv->tlv_length != tmpl->tmpl_length) { return (EINVAL); } if (tmpl->tmpl_type == 0) { break; } tlv = (const struct tlvhdr *) ((const char *)tlv + tlv->tlv_length); tmpl++; } return (0); } /* Debugging */ const char * lacp_format_mac(const uint8_t *mac, char *buf, size_t buflen) { snprintf(buf, buflen, "%02X-%02X-%02X-%02X-%02X-%02X", (int)mac[0], (int)mac[1], (int)mac[2], (int)mac[3], (int)mac[4], (int)mac[5]); return (buf); } const char * lacp_format_systemid(const struct lacp_systemid *sysid, char *buf, size_t buflen) { char macbuf[LACP_MACSTR_MAX+1]; snprintf(buf, buflen, "%04X,%s", ntohs(sysid->lsi_prio), lacp_format_mac(sysid->lsi_mac, macbuf, sizeof(macbuf))); return (buf); } const char * lacp_format_portid(const struct lacp_portid *portid, char *buf, size_t buflen) { snprintf(buf, buflen, "%04X,%04X", ntohs(portid->lpi_prio), ntohs(portid->lpi_portno)); return (buf); } const char * lacp_format_partner(const struct lacp_peerinfo *peer, char *buf, size_t buflen) { char sysid[LACP_SYSTEMIDSTR_MAX+1]; char portid[LACP_PORTIDSTR_MAX+1]; snprintf(buf, buflen, "(%s,%04X,%s)", lacp_format_systemid(&peer->lip_systemid, sysid, sizeof(sysid)), ntohs(peer->lip_key), lacp_format_portid(&peer->lip_portid, portid, sizeof(portid))); return (buf); } const char * lacp_format_lagid(const struct lacp_peerinfo *a, const struct lacp_peerinfo *b, char *buf, size_t buflen) { char astr[LACP_PARTNERSTR_MAX+1]; char bstr[LACP_PARTNERSTR_MAX+1]; #if 0 /* * there's a convention to display small numbered peer * in the left. */ if (lacp_compare_peerinfo(a, b) > 0) { const struct lacp_peerinfo *t; t = a; a = b; b = t; } #endif snprintf(buf, buflen, "[%s,%s]", lacp_format_partner(a, astr, sizeof(astr)), lacp_format_partner(b, bstr, sizeof(bstr))); return (buf); } const char * lacp_format_lagid_aggregator(const struct lacp_aggregator *la, char *buf, size_t buflen) { if (la == NULL) { return ("(none)"); } return (lacp_format_lagid(&la->la_actor, &la->la_partner, buf, buflen)); } const char * lacp_format_state(uint8_t state, char *buf, size_t buflen) { snprintf(buf, buflen, "%b", state, LACP_STATE_BITS); return (buf); } static void lacp_dump_lacpdu(const struct lacpdu *du) { char buf[LACP_PARTNERSTR_MAX+1]; char buf2[LACP_STATESTR_MAX+1]; printf("actor=%s\n", lacp_format_partner(&du->ldu_actor, buf, sizeof(buf))); printf("actor.state=%s\n", lacp_format_state(du->ldu_actor.lip_state, buf2, sizeof(buf2))); printf("partner=%s\n", lacp_format_partner(&du->ldu_partner, buf, sizeof(buf))); printf("partner.state=%s\n", lacp_format_state(du->ldu_partner.lip_state, buf2, sizeof(buf2))); printf("maxdelay=%d\n", ntohs(du->ldu_collector.lci_maxdelay)); } static void lacp_dprintf(const struct lacp_port *lp, const char *fmt, ...) { va_list va; if (lp) { printf("%s: ", lp->lp_ifp->if_xname); } va_start(va, fmt); vprintf(fmt, va); va_end(va); } Index: head/sys/net/if_lagg.c =================================================================== --- head/sys/net/if_lagg.c (revision 275357) +++ head/sys/net/if_lagg.c (revision 275358) @@ -1,2269 +1,2270 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * Copyright (c) 2014 Marcelo Araujo * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include /* Special flags we should propagate to the lagg ports. */ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) static VNET_DEFINE(struct mtx, lagg_list_mtx); #define V_lagg_list_mtx VNET(lagg_list_mtx) #define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ "if_lagg list", NULL, MTX_DEF) #define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) #define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); static VNET_DEFINE(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static void lagg_lladdr(struct lagg_softc *, uint8_t *); static void lagg_capabilities(struct lagg_softc *); static void lagg_port_lladdr(struct lagg_port *, uint8_t *); static void lagg_port_setlladdr(void *, int); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_ether_setmulti(struct lagg_softc *); static int lagg_ether_cmdmulti(struct lagg_port *, int); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *); /* Simple round robin */ static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *); static void lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* Broadcast */ static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *); static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ static const struct lagg_proto { lagg_proto pr_num; void (*pr_attach)(struct lagg_softc *); void (*pr_detach)(struct lagg_softc *); int (*pr_start)(struct lagg_softc *, struct mbuf *); struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, struct mbuf *); int (*pr_addport)(struct lagg_port *); void (*pr_delport)(struct lagg_port *); void (*pr_linkstate)(struct lagg_port *); void (*pr_init)(struct lagg_softc *); void (*pr_stop)(struct lagg_softc *); void (*pr_lladdr)(struct lagg_softc *); void (*pr_request)(struct lagg_softc *, void *); void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { { .pr_num = LAGG_PROTO_NONE }, { .pr_num = LAGG_PROTO_ROUNDROBIN, .pr_attach = lagg_rr_attach, .pr_start = lagg_rr_start, .pr_input = lagg_rr_input, }, { .pr_num = LAGG_PROTO_FAILOVER, .pr_start = lagg_fail_start, .pr_input = lagg_fail_input, }, { .pr_num = LAGG_PROTO_LOADBALANCE, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, .pr_addport = lagg_lb_port_create, .pr_delport = lagg_lb_port_destroy, }, { .pr_num = LAGG_PROTO_LACP, .pr_attach = lagg_lacp_attach, .pr_detach = lagg_lacp_detach, .pr_start = lagg_lacp_start, .pr_input = lagg_lacp_input, .pr_addport = lacp_port_create, .pr_delport = lacp_port_destroy, .pr_linkstate = lacp_linkstate, .pr_init = lacp_init, .pr_stop = lacp_stop, .pr_lladdr = lagg_lacp_lladdr, .pr_request = lacp_req, .pr_portreq = lacp_portreq, }, { .pr_num = LAGG_PROTO_ETHERCHANNEL, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, }, { .pr_num = LAGG_PROTO_BROADCAST, .pr_start = lagg_bcast_start, .pr_input = lagg_bcast_input, }, }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); /* Allow input on any failover links */ static VNET_DEFINE(int, lagg_failover_rx_all); #define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); -/* Default value for using M_FLOWID */ +/* Default value for using flowid */ static VNET_DEFINE(int, def_use_flowid) = 1; #define V_def_use_flowid VNET(def_use_flowid) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); -/* Default value for using M_FLOWID */ +/* Default value for flowid shift */ static VNET_DEFINE(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, &VNET_NAME(def_flowid_shift), 0, "Default setting for flowid shift for load sharing"); static void vnet_lagg_init(const void *unused __unused) { LAGG_LIST_LOCK_INIT(); SLIST_INIT(&V_lagg_list); V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, lagg_clone_destroy, 0); } VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_init, NULL); static void vnet_lagg_uninit(const void *unused __unused) { if_clone_detach(V_lagg_cloner); LAGG_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_uninit, NULL); static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: lagg_input_p = lagg_input; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); lagg_input_p = NULL; lagg_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t lagg_mod = { "if_lagg", lagg_modevent, 0 }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); static void lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) { KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto", __func__, sc)); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "using proto %u\n", pr); if (lagg_protos[pr].pr_attach != NULL) lagg_protos[pr].pr_attach(sc); sc->sc_proto = pr; } static void lagg_proto_detach(struct lagg_softc *sc) { lagg_proto pr; LAGG_WLOCK_ASSERT(sc); pr = sc->sc_proto; sc->sc_proto = LAGG_PROTO_NONE; if (lagg_protos[pr].pr_detach != NULL) lagg_protos[pr].pr_detach(sc); else LAGG_WUNLOCK(sc); } static int lagg_proto_start(struct lagg_softc *sc, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_start(sc, m)); } static struct mbuf * lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m)); } static int lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_addport == NULL) return (0); else return (lagg_protos[sc->sc_proto].pr_addport(lp)); } static void lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_delport != NULL) lagg_protos[sc->sc_proto].pr_delport(lp); } static void lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_linkstate != NULL) lagg_protos[sc->sc_proto].pr_linkstate(lp); } static void lagg_proto_init(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_init != NULL) lagg_protos[sc->sc_proto].pr_init(sc); } static void lagg_proto_stop(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_stop != NULL) lagg_protos[sc->sc_proto].pr_stop(sc); } static void lagg_proto_lladdr(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_lladdr != NULL) lagg_protos[sc->sc_proto].pr_lladdr(sc); } static void lagg_proto_request(struct lagg_softc *sc, void *v) { if (lagg_protos[sc->sc_proto].pr_request != NULL) lagg_protos[sc->sc_proto].pr_request(sc, v); } static void lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v) { if (lagg_protos[sc->sc_proto].pr_portreq != NULL) lagg_protos[sc->sc_proto].pr_portreq(lp, v); } /* * This routine is run via an vlan * config EVENT */ static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(sc, &tracker); if (!SLIST_EMPTY(&sc->sc_ports)) { SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); } LAGG_RUNLOCK(sc, &tracker); } /* * This routine is run via an vlan * unconfig EVENT */ static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(sc, &tracker); if (!SLIST_EMPTY(&sc->sc_ports)) { SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); } LAGG_RUNLOCK(sc, &tracker); } static int lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct lagg_softc *sc; struct ifnet *ifp; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4; lagg_proto_attach(sc, LAGG_PROTO_DEFAULT); LAGG_LOCK_INIT(sc); SLIST_INIT(&sc->sc_ports); TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc); /* Initialise pseudo media types */ ifmedia_init(&sc->sc_media, 0, lagg_media_change, lagg_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); if_initname(ifp, laggname, unit); ifp->if_softc = sc; ifp->if_transmit = lagg_transmit; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device, children will be attached * as special device IFT_IEEE8023ADLAG. */ ether_ifattach(ifp, eaddr); sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ LAGG_LIST_LOCK(); SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); LAGG_LIST_UNLOCK(); return (0); } static void lagg_clone_destroy(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; LAGG_WLOCK(sc); lagg_stop(sc); ifp->if_flags &= ~IFF_UP; EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); /* Shutdown and remove lagg ports */ while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ lagg_proto_detach(sc); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); if_free(ifp); LAGG_LIST_LOCK(); SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); LAGG_LIST_UNLOCK(); taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task); LAGG_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } static void lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port lp; if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) return; LAGG_WLOCK_ASSERT(sc); /* * Set the link layer address on the lagg interface. * lagg_proto_lladdr() notifies the MAC change to * the aggregation protocol. iflladdr_event handler which * may trigger gratuitous ARPs for INET will be handled in * a taskqueue. */ bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN); lagg_proto_lladdr(sc); bzero(&lp, sizeof(lp)); lp.lp_ifp = sc->sc_ifp; lp.lp_softc = sc; lagg_port_lladdr(&lp, lladdr); } static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; int cap = ~0, ena = ~0; u_long hwa = ~0UL; struct ifnet_hw_tsomax hw_tsomax; LAGG_WLOCK_ASSERT(sc); memset(&hw_tsomax, 0, sizeof(hw_tsomax)); /* Get capabilities from the lagg ports */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; ena &= lp->lp_ifp->if_capenable; hwa &= lp->lp_ifp->if_hwassist; if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax); } cap = (cap == ~0 ? 0 : cap); ena = (ena == ~0 ? 0 : ena); hwa = (hwa == ~0 ? 0 : hwa); if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_hwassist != hwa || if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_hwassist = hwa; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "capabilities 0x%08x enabled 0x%08x\n", cap, ena); } } static void lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct lagg_llq *llq; int pending = 0; int primary; LAGG_WLOCK_ASSERT(sc); primary = (sc->sc_primary->lp_ifp == ifp) ? 1 : 0; if (primary == 0 && (lp->lp_detaching || memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)) return; /* Check to make sure its not already queued to be changed */ SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { if (llq->llq_ifp == ifp) { pending = 1; break; } } if (!pending) { llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT); if (llq == NULL) /* XXX what to do */ return; } /* Update the lladdr even if pending, it may have changed */ llq->llq_ifp = ifp; llq->llq_primary = primary; bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); if (!pending) SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries); taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task); } /* * Set the interface MAC address from a taskqueue to avoid a LOR. */ static void lagg_port_setlladdr(void *arg, int pending) { struct lagg_softc *sc = (struct lagg_softc *)arg; struct lagg_llq *llq, *head; struct ifnet *ifp; int error; /* Grab a local reference of the queue and remove it from the softc */ LAGG_WLOCK(sc); head = SLIST_FIRST(&sc->sc_llq_head); SLIST_FIRST(&sc->sc_llq_head) = NULL; LAGG_WUNLOCK(sc); /* * Traverse the queue and set the lladdr on each ifp. It is safe to do * unlocked as we have the only reference to it. */ for (llq = head; llq != NULL; llq = head) { ifp = llq->llq_ifp; CURVNET_SET(ifp->if_vnet); if (llq->llq_primary == 0) { /* * Set the link layer address on the laggport interface. * if_setlladdr() triggers gratuitous ARPs for INET. */ error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN); if (error) printf("%s: setlladdr failed on %s\n", __func__, ifp->if_xname); } else EVENTHANDLER_INVOKE(iflladdr_event, ifp); CURVNET_RESTORE(); head = SLIST_NEXT(llq, llq_entries); free(llq, M_DEVBUF); } } static int lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp, *tlp; int error, i; uint64_t *pval; LAGG_WLOCK_ASSERT(sc); /* Limit the maximal number of lagg ports */ if (sc->sc_count >= LAGG_MAX_PORTS) return (ENOSPC); /* Check if port has already been associated to a lagg */ if (ifp->if_lagg != NULL) { /* Port is already in the current lagg? */ lp = (struct lagg_port *)ifp->if_lagg; if (lp->lp_softc == sc) return (EEXIST); return (EBUSY); } /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ if (ifp->if_type != IFT_ETHER) return (EPROTONOSUPPORT); /* Allow the first Ethernet member to define the MTU */ if (SLIST_EMPTY(&sc->sc_ports)) sc->sc_ifp->if_mtu = ifp->if_mtu; else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { if_printf(sc->sc_ifp, "invalid MTU for %s\n", ifp->if_xname); return (EINVAL); } if ((lp = malloc(sizeof(struct lagg_port), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) return (ENOMEM); /* Check if port is a stacked lagg */ LAGG_LIST_LOCK(); SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (EINVAL); /* XXX disable stacking for the moment, its untested */ #ifdef LAGG_PORT_STACKING lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (E2BIG); } #endif } } LAGG_LIST_UNLOCK(); /* Change the interface type */ lp->lp_iftype = ifp->if_type; ifp->if_type = IFT_IEEE8023ADLAG; ifp->if_lagg = lp; lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; lp->lp_output = ifp->if_output; ifp->if_output = lagg_port_output; lp->lp_ifp = ifp; lp->lp_softc = sc; /* Save port link layer address */ bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); if (SLIST_EMPTY(&sc->sc_ports)) { sc->sc_primary = lp; lagg_lladdr(sc, IF_LLADDR(ifp)); } else { /* Update link layer address for this port */ lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp)); } /* Insert into the list of ports. Keep ports sorted by if_index. */ SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( SLIST_NEXT(tlp, lp_entries) == NULL || SLIST_NEXT(tlp, lp_entries)->lp_ifp->if_index < ifp->if_index)) break; } if (tlp != NULL) SLIST_INSERT_AFTER(tlp, lp, lp_entries); else SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); sc->sc_count++; /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); /* Read port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) *pval = ifp->if_get_counter(ifp, i); /* Add multicast addresses and interface flags to this port */ lagg_ether_cmdmulti(lp, 1); lagg_setflags(lp, 1); if ((error = lagg_proto_addport(sc, lp)) != 0) { /* Remove the port, without calling pr_delport. */ lagg_port_destroy(lp, 0); return (error); } return (0); } #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *sc) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int m = 0; LAGG_WLOCK_ASSERT(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_flags & LAGG_PORT_STACK) { sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; m = MAX(m, lagg_port_checkstacking(sc_ptr)); } } return (m + 1); } #endif static int lagg_port_destroy(struct lagg_port *lp, int rundelport) { struct lagg_softc *sc = lp->lp_softc; struct lagg_port *lp_ptr; struct lagg_llq *llq; struct ifnet *ifp = lp->lp_ifp; uint64_t *pval, vdiff; int i; LAGG_WLOCK_ASSERT(sc); if (rundelport) lagg_proto_delport(sc, lp); /* * Remove multicast addresses and interface flags from this port and * reset the MAC address, skip if the interface is being detached. */ if (!lp->lp_detaching) { lagg_ether_cmdmulti(lp, 0); lagg_setflags(lp, 0); lagg_port_lladdr(lp, lp->lp_lladdr); } /* Restore interface */ ifp->if_type = lp->lp_iftype; ifp->if_ioctl = lp->lp_ioctl; ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; /* Update detached port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) { vdiff = ifp->if_get_counter(ifp, i) - *pval; sc->detached_counters.val[i] += vdiff; } /* Finally, remove the port from the lagg */ SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; /* Update the primary interface */ if (lp == sc->sc_primary) { uint8_t lladdr[ETHER_ADDR_LEN]; if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) { bzero(&lladdr, ETHER_ADDR_LEN); } else { bcopy(lp_ptr->lp_lladdr, lladdr, ETHER_ADDR_LEN); } lagg_lladdr(sc, lladdr); sc->sc_primary = lp_ptr; /* Update link layer address for each port */ SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) lagg_port_lladdr(lp_ptr, lladdr); } /* Remove any pending lladdr changes from the queue */ if (lp->lp_detaching) { SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { if (llq->llq_ifp == ifp) { SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq, llq_entries); free(llq, M_DEVBUF); break; /* Only appears once */ } } } if (lp->lp_ifflags) if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); free(lp, M_DEVBUF); /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } static int lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_reqport *rp = (struct lagg_reqport *)data; struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; struct rm_priotracker tracker; /* Should be checked by the caller */ if (ifp->if_type != IFT_IEEE8023ADLAG || (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) goto fallback; switch (cmd) { case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || ifunit(rp->rp_portname) != ifp) { error = EINVAL; break; } LAGG_RLOCK(sc, &tracker); if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(sc, &tracker); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(sc, &tracker); break; case SIOCSIFCAP: if (lp->lp_ioctl == NULL) { error = EINVAL; break; } error = (*lp->lp_ioctl)(ifp, cmd, data); if (error) break; /* Update lagg interface capabilities */ LAGG_WLOCK(sc); lagg_capabilities(sc); LAGG_WUNLOCK(sc); break; case SIOCSIFMTU: /* Do not allow the MTU to be changed once joined */ error = EINVAL; break; default: goto fallback; } return (error); fallback: if (lp->lp_ioctl != NULL) return ((*lp->lp_ioctl)(ifp, cmd, data)); return (EINVAL); } /* * Requests counter @cnt data. * * Counter value is calculated the following way: * 1) for each port, sum difference between current and "initial" measurements. * 2) add lagg logical interface counters. * 3) add data from detached_counters array. * * We also do the following things on ports attach/detach: * 1) On port attach we store all counters it has into port_counter array. * 2) On port detach we add the different between "initial" and * current counters data to detached_counters array. */ static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt) { struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lpifp; struct rm_priotracker tracker; uint64_t newval, oldval, vsum; /* Revise this when we've got non-generic counters. */ KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); sc = (struct lagg_softc *)ifp->if_softc; LAGG_RLOCK(sc, &tracker); vsum = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* Saved attached value */ oldval = lp->port_counters.val[cnt]; /* current value */ lpifp = lp->lp_ifp; newval = lpifp->if_get_counter(lpifp, cnt); /* Calculate diff and save new */ vsum += newval - oldval; } /* * Add counter data which might be added by upper * layer protocols operating on logical interface. */ vsum += if_get_counter_default(ifp, cnt); /* * Add counter data from detached ports counters */ vsum += sc->detached_counters.val[cnt]; LAGG_RUNLOCK(sc, &tracker); return (vsum); } /* * For direct output to child ports. */ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: return ((*lp->lp_output)(ifp, m, dst, ro)); } /* drop any other frames */ m_freem(m); return (ENETDOWN); } static void lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) { struct lagg_port *lp; struct lagg_softc *sc; if ((lp = ifp->if_lagg) == NULL) return; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; sc = lp->lp_softc; LAGG_WLOCK(sc); lp->lp_detaching = 1; lagg_port_destroy(lp, 1); LAGG_WUNLOCK(sc); } static void lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) { struct lagg_softc *sc = lp->lp_softc; strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; lagg_proto_portreq(sc, lp, &rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: if (lp == sc->sc_primary) rp->rp_flags |= LAGG_PORT_MASTER; if (lp == lagg_link_active(sc, sc->sc_primary)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_ETHERCHANNEL: case LAGG_PROTO_BROADCAST: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_LACP: /* LACP has a different definition of active */ if (lacp_isactive(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; if (lacp_iscollecting(lp)) rp->rp_flags |= LAGG_PORT_COLLECTING; if (lacp_isdistributing(lp)) rp->rp_flags |= LAGG_PORT_DISTRIBUTING; break; } } static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; struct lagg_port *lp; struct ifnet *ifp = sc->sc_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; LAGG_WLOCK(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; /* Update the port lladdrs */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_port_lladdr(lp, IF_LLADDR(ifp)); lagg_proto_init(sc); LAGG_WUNLOCK(sc); } static void lagg_stop(struct lagg_softc *sc) { struct ifnet *ifp = sc->sc_ifp; LAGG_WLOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; lagg_proto_stop(sc); } static int lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0; struct rm_priotracker tracker; bzero(&rpbuf, sizeof(rpbuf)); switch (cmd) { case SIOCGLAGG: LAGG_RLOCK(sc, &tracker); count = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) count++; buflen = count * sizeof(struct lagg_reqport); LAGG_RUNLOCK(sc, &tracker); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); LAGG_RLOCK(sc, &tracker); ra->ra_proto = sc->sc_proto; lagg_proto_request(sc, &ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (len < sizeof(rpbuf)) break; lagg_port2req(lp, &rpbuf); memcpy(buf, &rpbuf, sizeof(rpbuf)); count++; buf += sizeof(rpbuf); len -= sizeof(rpbuf); } LAGG_RUNLOCK(sc, &tracker); ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); free(outbuf, M_TEMP); break; case SIOCSLAGG: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ra->ra_proto < 1 || ra->ra_proto >= LAGG_PROTO_MAX) { error = EPROTONOSUPPORT; break; } LAGG_WLOCK(sc); lagg_proto_detach(sc); lagg_proto_attach(sc, ra->ra_proto); break; case SIOCGLAGGOPTS: ro->ro_opts = sc->sc_opts; if (sc->sc_proto == LAGG_PROTO_LACP) { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; if (lsc->lsc_debug.lsc_tx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_TXTEST; if (lsc->lsc_debug.lsc_rx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; ro->ro_active = sc->sc_active; } else { ro->ro_active = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; break; case SIOCSLAGGOPTS: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ro->ro_opts == 0) break; /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. */ int valid, lacp; switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_FLOWIDSHIFT: valid = 1; lacp = 0; break; case LAGG_OPT_LACP_TXTEST: case -LAGG_OPT_LACP_TXTEST: case LAGG_OPT_LACP_RXTEST: case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: valid = lacp = 1; break; default: valid = lacp = 0; break; } LAGG_WLOCK(sc); if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. */ error = EINVAL; LAGG_WUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } /* * Store new options into sc->sc_opts except for * FLOWIDSHIFT and LACP options. */ if (lacp == 0) { if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) sc->flowid_shift = ro->ro_flowid_shift; else if (ro->ro_opts > 0) sc->sc_opts |= ro->ro_opts; else sc->sc_opts &= ~ro->ro_opts; } else { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; switch (ro->ro_opts) { case LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 1; break; case -LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 0; break; case LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 1; break; case -LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 0; break; case LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 1; break; case -LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 0; break; } } LAGG_WUNLOCK(sc); break; case SIOCGLAGGFLAGS: rf->rf_flags = sc->sc_flags; break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { error = EINVAL; break; } LAGG_WLOCK(sc); sc->sc_flags &= ~LAGG_F_HASHMASK; sc->sc_flags |= rf->rf_flags & LAGG_F_HASHMASK; LAGG_WUNLOCK(sc); break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_RLOCK(sc, &tracker); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(sc, &tracker); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(sc, &tracker); break; case SIOCSLAGGPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } #ifdef INET6 /* * A laggport interface should not have inet6 address * because two interfaces with a valid link-local * scope zone must not be merged in any form. This * restriction is needed to prevent violation of * link-local scope zone. Attempts to add a laggport * interface which has inet6 addresses triggers * removal of all inet6 addresses on the member * interface. */ if (in6ifa_llaonifp(tpif)) { in6_ifdetach(tpif); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", tpif->if_xname); } #endif LAGG_WLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_WUNLOCK(sc); break; case SIOCSLAGGDELPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_WLOCK(sc); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_WUNLOCK(sc); break; } error = lagg_port_destroy(lp, 1); LAGG_WUNLOCK(sc); break; case SIOCSIFFLAGS: /* Set flags on ports too */ LAGG_WLOCK(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setflags(lp, 1); } LAGG_WUNLOCK(sc); if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ LAGG_WLOCK(sc); lagg_stop(sc); LAGG_WUNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ (*ifp->if_init)(sc); } break; case SIOCADDMULTI: case SIOCDELMULTI: LAGG_WLOCK(sc); error = lagg_ether_setmulti(sc); LAGG_WUNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: case SIOCSIFMTU: /* Do not allow the MTU or caps to be directly changed */ error = EINVAL; break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static int lagg_ether_setmulti(struct lagg_softc *sc) { struct lagg_port *lp; LAGG_WLOCK_ASSERT(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* First, remove any existing filter entries. */ lagg_ether_cmdmulti(lp, 0); /* copy all addresses from the lagg interface to the port */ lagg_ether_cmdmulti(lp, 1); } return (0); } static int lagg_ether_cmdmulti(struct lagg_port *lp, int set) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct ifnet *scifp = sc->sc_ifp; struct lagg_mc *mc; struct ifmultiaddr *ifma; int error; LAGG_WLOCK_ASSERT(sc); if (set) { IF_ADDR_WLOCK(scifp); TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp->if_index; mc->mc_ifma = NULL; SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); } IF_ADDR_WUNLOCK(scifp); SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) { error = if_addmulti(ifp, (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma); if (error) return (error); } } else { while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && !lp->lp_detaching) if_delmulti_ifma(mc->mc_ifma); free(mc, M_DEVBUF); } } return (0); } /* Handle a ref counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct ifnet *ifp = lp->lp_ifp; int error; LAGG_WLOCK_ASSERT(sc); status = status ? (scifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded ports status is different from what * we want it to be. If it is, flip it. We record ports * status in lp_ifflags so that we won't clear ports flag * we haven't set. In fact, we don't clear or set ports * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual ports flags. */ if (status != (lp->lp_ifflags & flag)) { error = (*func)(ifp, status); if (error) return (error); lp->lp_ifflags &= ~flag; lp->lp_ifflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the lagg port * if "status" is true, update ports flags respective to the lagg * if "status" is false, forcedly clear the flags set on port. */ static int lagg_setflags(struct lagg_port *lp, int status) { int error, i; for (i = 0; lagg_pflags[i].flag; i++) { error = lagg_setflag(lp, lagg_pflags[i].flag, status, lagg_pflags[i].func); if (error) return (error); } return (0); } static int lagg_transmit(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error, len, mcast; struct rm_priotracker tracker; len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; LAGG_RLOCK(sc, &tracker); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { LAGG_RUNLOCK(sc, &tracker); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } ETHER_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); LAGG_RUNLOCK(sc, &tracker); if (error != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } /* * The ifp->if_qflush entry point for lagg(4) is no-op. */ static void lagg_qflush(struct ifnet *ifp __unused) { } static struct mbuf * lagg_input(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct rm_priotracker tracker; LAGG_RLOCK(sc, &tracker); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (lp->lp_flags & LAGG_PORT_DISABLED) || sc->sc_proto == LAGG_PROTO_NONE) { LAGG_RUNLOCK(sc, &tracker); m_freem(m); return (NULL); } ETHER_BPF_MTAP(scifp, m); m = (lp->lp_detaching == 0) ? lagg_proto_input(sc, lp, m) : NULL; if (m != NULL) { if (scifp->if_flags & IFF_MONITOR) { m_freem(m); m = NULL; } } LAGG_RUNLOCK(sc, &tracker); return (m); } static int lagg_media_change(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; if (sc->sc_ifflags & IFF_DEBUG) printf("%s\n", __func__); /* Ignore */ return (0); } static void lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; struct rm_priotracker tracker; imr->ifm_status = IFM_AVALID; imr->ifm_active = IFM_ETHER | IFM_AUTO; LAGG_RLOCK(sc, &tracker); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp)) imr->ifm_status |= IFM_ACTIVE; } LAGG_RUNLOCK(sc, &tracker); } static void lagg_linkstate(struct lagg_softc *sc) { struct lagg_port *lp; int new_link = LINK_STATE_DOWN; uint64_t speed; /* Our link is considered up if at least one of our ports is active */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } if_link_state_change(sc->sc_ifp, new_link); /* Update if_baudrate to reflect the max possible speed */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ? sc->sc_primary->lp_ifp->if_baudrate : 0; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_ETHERCHANNEL: case LAGG_PROTO_BROADCAST: speed = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; sc->sc_ifp->if_baudrate = speed; break; case LAGG_PROTO_LACP: /* LACP updates if_baudrate itself */ break; } } static void lagg_port_state(struct ifnet *ifp, int state) { struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; struct lagg_softc *sc = NULL; if (lp != NULL) sc = lp->lp_softc; if (sc == NULL) return; LAGG_WLOCK(sc); lagg_linkstate(sc); lagg_proto_linkstate(sc, lp); LAGG_WUNLOCK(sc); } struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; // int new_link = LINK_STATE_DOWN; LAGG_RLOCK_ASSERT(sc); /* * Search a port which reports an active link state. */ if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { rval = lp; goto found; } if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL && LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } search: SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } } found: if (rval != NULL) { /* * The IEEE 802.1D standard assumes that a lagg with * multiple ports is always full duplex. This is valid * for load sharing laggs and if at least two links * are active. Unfortunately, checking the latter would * be too expensive at this point. XXX if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) && (sc->sc_count > 1)) new_link = LINK_STATE_FULL_DUPLEX; else new_link = rval->lp_link_state; */ } return (rval); } static const void * lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf) { if (m->m_pkthdr.len < (off + len)) { return (NULL); } else if (m->m_len < (off + len)) { m_copydata(m, off, len, buf); return (buf); } return (mtod(m, char *) + off); } uint32_t lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key) { uint16_t etype; uint32_t p = key; int off; struct ether_header *eh; const struct ether_vlan_header *vlan; #ifdef INET const struct ip *ip; const uint32_t *ports; int iphlen; #endif #ifdef INET6 const struct ip6_hdr *ip6; uint32_t flow; #endif union { #ifdef INET struct ip ip; #endif #ifdef INET6 struct ip6_hdr ip6; #endif struct ether_vlan_header vlan; uint32_t port; } buf; off = sizeof(*eh); if (m->m_len < off) goto out; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); if (sc->sc_flags & LAGG_F_HASHL2) { p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p); p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p); } /* Special handling for encapsulating VLAN frames */ if ((m->m_flags & M_VLANTAG) && (sc->sc_flags & LAGG_F_HASHL2)) { p = hash32_buf(&m->m_pkthdr.ether_vtag, sizeof(m->m_pkthdr.ether_vtag), p); } else if (etype == ETHERTYPE_VLAN) { vlan = lagg_gethdr(m, off, sizeof(*vlan), &buf); if (vlan == NULL) goto out; if (sc->sc_flags & LAGG_F_HASHL2) p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p); etype = ntohs(vlan->evl_proto); off += sizeof(*vlan) - sizeof(*eh); } switch (etype) { #ifdef INET case ETHERTYPE_IP: ip = lagg_gethdr(m, off, sizeof(*ip), &buf); if (ip == NULL) goto out; if (sc->sc_flags & LAGG_F_HASHL3) { p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p); p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p); } if (!(sc->sc_flags & LAGG_F_HASHL4)) break; switch (ip->ip_p) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: iphlen = ip->ip_hl << 2; if (iphlen < sizeof(*ip)) break; off += iphlen; ports = lagg_gethdr(m, off, sizeof(*ports), &buf); if (ports == NULL) break; p = hash32_buf(ports, sizeof(*ports), p); break; } break; #endif #ifdef INET6 case ETHERTYPE_IPV6: if (!(sc->sc_flags & LAGG_F_HASHL3)) break; ip6 = lagg_gethdr(m, off, sizeof(*ip6), &buf); if (ip6 == NULL) goto out; p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p); p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p); flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK; p = hash32_buf(&flow, sizeof(flow), p); /* IPv6 flow label */ break; #endif } out: return (p); } int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { return (ifp->if_transmit)(ifp, m); } /* * Simple round robin aggregation */ static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; sc->sc_seq = 0; } static int lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; uint32_t p; p = atomic_fetchadd_32(&sc->sc_seq, 1); p %= sc->sc_count; lp = SLIST_FIRST(&sc->sc_ports); while (p--) lp = SLIST_NEXT(lp, lp_entries); /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Broadcast mode */ static int lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) { int active_ports = 0; int errors = 0; int ret; struct lagg_port *lp, *last = NULL; struct mbuf *m0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; active_ports++; if (last != NULL) { m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m0 == NULL) { ret = ENOBUFS; errors++; break; } ret = lagg_enqueue(last->lp_ifp, m0); if (ret != 0) errors++; } last = lp; } if (last == NULL) { m_freem(m); return (ENOENT); } if ((last = lagg_link_active(sc, last)) == NULL) { m_freem(m); return (ENETDOWN); } ret = lagg_enqueue(last->lp_ifp, m); if (ret != 0) errors++; if (errors == 0) return (ret); return (0); } static struct mbuf* lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Active failover */ static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* * If tmp_tp is null, we've recieved a packet when all * our links are down. Weird, but process it anyways. */ if ((tmp_tp == NULL || tmp_tp == lp)) { m->m_pkthdr.rcvif = ifp; return (m); } } m_freem(m); return (NULL); } /* * Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO); sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; lb->lb_key = arc4random(); sc->sc_psc = lb; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); } static void lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb; lb = (struct lagg_lb *)sc->sc_psc; LAGG_WUNLOCK(sc); if (lb != NULL) free(lb, M_DEVBUF); } static int lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; int i = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; if (i >= LAGG_MAX_PORTS) return (EINVAL); if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } return (0); } static int lagg_lb_port_create(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; return (lagg_lb_porttable(sc, NULL)); } static void lagg_lb_port_destroy(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; lagg_lb_porttable(sc, lp); } static int lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp = NULL; uint32_t p = 0; - if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && (m->m_flags & M_FLOWID)) + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && + M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; else p = lagg_hashmbuf(sc, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; lacp_attach(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static void lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; void *psc; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); psc = sc->sc_psc; sc->sc_psc = NULL; LAGG_WUNLOCK(sc); lacp_detach(psc); } static void lagg_lacp_lladdr(struct lagg_softc *sc) { struct lagg_port *lp; /* purge all the lacp ports */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* add them back in */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static int lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; lp = lacp_select_tx_port(sc, m); if (lp == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct ether_header *eh; u_short etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = lacp_input(lp, m); if (m == NULL) return (NULL); } /* * If the port is not collecting or not in the active aggregator then * free and return. */ if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { m_freem(m); return (NULL); } m->m_pkthdr.rcvif = ifp; return (m); } Index: head/sys/net/if_lagg.h =================================================================== --- head/sys/net/if_lagg.h (revision 275357) +++ head/sys/net/if_lagg.h (revision 275358) @@ -1,288 +1,288 @@ /* $OpenBSD: if_trunk.h,v 1.11 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * $FreeBSD$ */ #ifndef _NET_LAGG_H #define _NET_LAGG_H /* * Global definitions */ #define LAGG_MAX_PORTS 32 /* logically */ #define LAGG_MAX_NAMESIZE 32 /* name of a protocol */ #define LAGG_MAX_STACKING 4 /* maximum number of stacked laggs */ /* Lagg flags */ #define LAGG_F_HASHL2 0x00000001 /* hash layer 2 */ #define LAGG_F_HASHL3 0x00000002 /* hash layer 3 */ #define LAGG_F_HASHL4 0x00000004 /* hash layer 4 */ #define LAGG_F_HASHMASK 0x00000007 /* Port flags */ #define LAGG_PORT_SLAVE 0x00000000 /* normal enslaved port */ #define LAGG_PORT_MASTER 0x00000001 /* primary port */ #define LAGG_PORT_STACK 0x00000002 /* stacked lagg port */ #define LAGG_PORT_ACTIVE 0x00000004 /* port is active */ #define LAGG_PORT_COLLECTING 0x00000008 /* port is receiving frames */ #define LAGG_PORT_DISTRIBUTING 0x00000010 /* port is sending frames */ #define LAGG_PORT_DISABLED 0x00000020 /* port is disabled */ #define LAGG_PORT_BITS "\20\01MASTER\02STACK\03ACTIVE\04COLLECTING" \ "\05DISTRIBUTING\06DISABLED" /* Supported lagg PROTOs */ typedef enum { LAGG_PROTO_NONE = 0, /* no lagg protocol defined */ LAGG_PROTO_ROUNDROBIN, /* simple round robin */ LAGG_PROTO_FAILOVER, /* active failover */ LAGG_PROTO_LOADBALANCE, /* loadbalance */ LAGG_PROTO_LACP, /* 802.3ad lacp */ LAGG_PROTO_ETHERCHANNEL,/* Cisco FEC */ LAGG_PROTO_BROADCAST, /* broadcast */ LAGG_PROTO_MAX, } lagg_proto; struct lagg_protos { const char *lpr_name; lagg_proto lpr_proto; }; #define LAGG_PROTO_DEFAULT LAGG_PROTO_FAILOVER #define LAGG_PROTOS { \ { "failover", LAGG_PROTO_FAILOVER }, \ { "fec", LAGG_PROTO_ETHERCHANNEL }, \ { "lacp", LAGG_PROTO_LACP }, \ { "loadbalance", LAGG_PROTO_LOADBALANCE }, \ { "roundrobin", LAGG_PROTO_ROUNDROBIN }, \ { "broadcast", LAGG_PROTO_BROADCAST }, \ { "none", LAGG_PROTO_NONE }, \ { "default", LAGG_PROTO_DEFAULT } \ } /* * lagg ioctls. */ /* * LACP current operational parameters structure. */ struct lacp_opreq { uint16_t actor_prio; uint8_t actor_mac[ETHER_ADDR_LEN]; uint16_t actor_key; uint16_t actor_portprio; uint16_t actor_portno; uint8_t actor_state; uint16_t partner_prio; uint8_t partner_mac[ETHER_ADDR_LEN]; uint16_t partner_key; uint16_t partner_portprio; uint16_t partner_portno; uint8_t partner_state; }; /* lagg port settings */ struct lagg_reqport { char rp_ifname[IFNAMSIZ]; /* name of the lagg */ char rp_portname[IFNAMSIZ]; /* name of the port */ u_int32_t rp_prio; /* port priority */ u_int32_t rp_flags; /* port flags */ union { struct lacp_opreq rpsc_lacp; } rp_psc; #define rp_lacpreq rp_psc.rpsc_lacp }; #define SIOCGLAGGPORT _IOWR('i', 140, struct lagg_reqport) #define SIOCSLAGGPORT _IOW('i', 141, struct lagg_reqport) #define SIOCSLAGGDELPORT _IOW('i', 142, struct lagg_reqport) /* lagg, ports and options */ struct lagg_reqall { char ra_ifname[IFNAMSIZ]; /* name of the lagg */ u_int ra_proto; /* lagg protocol */ size_t ra_size; /* size of buffer */ struct lagg_reqport *ra_port; /* allocated buffer */ int ra_ports; /* total port count */ union { struct lacp_opreq rpsc_lacp; } ra_psc; #define ra_lacpreq ra_psc.rpsc_lacp }; #define SIOCGLAGG _IOWR('i', 143, struct lagg_reqall) #define SIOCSLAGG _IOW('i', 144, struct lagg_reqall) struct lagg_reqflags { char rf_ifname[IFNAMSIZ]; /* name of the lagg */ uint32_t rf_flags; /* lagg protocol */ }; #define SIOCGLAGGFLAGS _IOWR('i', 145, struct lagg_reqflags) #define SIOCSLAGGHASH _IOW('i', 146, struct lagg_reqflags) struct lagg_reqopts { char ro_ifname[IFNAMSIZ]; /* name of the lagg */ int ro_opts; /* Option bitmap */ #define LAGG_OPT_NONE 0x00 -#define LAGG_OPT_USE_FLOWID 0x01 /* use M_FLOWID */ +#define LAGG_OPT_USE_FLOWID 0x01 /* enable use of flowid */ /* Pseudo flags which are used in ro_opts but not stored into sc_opts. */ -#define LAGG_OPT_FLOWIDSHIFT 0x02 /* Set flowid */ +#define LAGG_OPT_FLOWIDSHIFT 0x02 /* set flowid shift */ #define LAGG_OPT_FLOWIDSHIFT_MASK 0x1f /* flowid is uint32_t */ #define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */ #define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */ #define LAGG_OPT_LACP_RXTEST 0x40 /* LACP debug: rxtest */ u_int ro_count; /* number of ports */ u_int ro_active; /* active port count */ u_int ro_flapping; /* number of flapping */ int ro_flowid_shift; /* shift the flowid */ }; #define SIOCGLAGGOPTS _IOWR('i', 152, struct lagg_reqopts) #define SIOCSLAGGOPTS _IOW('i', 153, struct lagg_reqopts) #define LAGG_OPT_BITS "\020\001USE_FLOWID\005LACP_STRICT" \ "\006LACP_TXTEST\007LACP_RXTEST" #ifdef _KERNEL /* * Internal kernel part */ #define LAGG_PORTACTIVE(_tp) ( \ ((_tp)->lp_ifp->if_link_state == LINK_STATE_UP) && \ ((_tp)->lp_ifp->if_flags & IFF_UP) \ ) struct lagg_ifreq { union { struct ifreq ifreq; struct { char ifr_name[IFNAMSIZ]; struct sockaddr_storage ifr_ss; } ifreq_storage; } ifreq; }; #define sc_ifflags sc_ifp->if_flags /* flags */ #define sc_ifname sc_ifp->if_xname /* name */ #define sc_capabilities sc_ifp->if_capabilities /* capabilities */ #define IFCAP_LAGG_MASK 0xffff0000 /* private capabilities */ #define IFCAP_LAGG_FULLDUPLEX 0x00010000 /* full duplex with >1 ports */ /* Private data used by the loadbalancing protocol */ struct lagg_lb { u_int32_t lb_key; struct lagg_port *lb_ports[LAGG_MAX_PORTS]; }; struct lagg_mc { struct sockaddr_dl mc_addr; struct ifmultiaddr *mc_ifma; SLIST_ENTRY(lagg_mc) mc_entries; }; /* List of interfaces to have the MAC address modified */ struct lagg_llq { struct ifnet *llq_ifp; uint8_t llq_lladdr[ETHER_ADDR_LEN]; uint8_t llq_primary; SLIST_ENTRY(lagg_llq) llq_entries; }; struct lagg_counters { uint64_t val[IFCOUNTERS]; }; struct lagg_softc { struct ifnet *sc_ifp; /* virtual interface */ struct rmlock sc_mtx; int sc_proto; /* lagg protocol */ u_int sc_count; /* number of ports */ u_int sc_active; /* active port count */ u_int sc_flapping; /* number of flapping * events */ struct lagg_port *sc_primary; /* primary port */ struct ifmedia sc_media; /* media config */ void *sc_psc; /* protocol data */ uint32_t sc_seq; /* sequence counter */ uint32_t sc_flags; SLIST_HEAD(__tplhd, lagg_port) sc_ports; /* list of interfaces */ SLIST_ENTRY(lagg_softc) sc_entries; struct task sc_lladdr_task; SLIST_HEAD(__llqhd, lagg_llq) sc_llq_head; /* interfaces to program the lladdr on */ eventhandler_tag vlan_attach; eventhandler_tag vlan_detach; struct callout sc_callout; u_int sc_opts; int flowid_shift; /* shift the flowid */ struct lagg_counters detached_counters; /* detached ports sum */ }; struct lagg_port { struct ifnet *lp_ifp; /* physical interface */ struct lagg_softc *lp_softc; /* parent lagg */ uint8_t lp_lladdr[ETHER_ADDR_LEN]; u_char lp_iftype; /* interface type */ uint32_t lp_prio; /* port priority */ uint32_t lp_flags; /* port flags */ int lp_ifflags; /* saved ifp flags */ void *lh_cookie; /* if state hook */ void *lp_psc; /* protocol data */ int lp_detaching; /* ifnet is detaching */ SLIST_HEAD(__mclhd, lagg_mc) lp_mc_head; /* multicast addresses */ /* Redirected callbacks */ int (*lp_ioctl)(struct ifnet *, u_long, caddr_t); int (*lp_output)(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); struct lagg_counters port_counters; /* ifp counters copy */ SLIST_ENTRY(lagg_port) lp_entries; }; #define LAGG_LOCK_INIT(_sc) rm_init(&(_sc)->sc_mtx, "if_lagg rmlock") #define LAGG_LOCK_DESTROY(_sc) rm_destroy(&(_sc)->sc_mtx) #define LAGG_RLOCK(_sc, _p) rm_rlock(&(_sc)->sc_mtx, (_p)) #define LAGG_WLOCK(_sc) rm_wlock(&(_sc)->sc_mtx) #define LAGG_RUNLOCK(_sc, _p) rm_runlock(&(_sc)->sc_mtx, (_p)) #define LAGG_WUNLOCK(_sc) rm_wunlock(&(_sc)->sc_mtx) #define LAGG_RLOCK_ASSERT(_sc) rm_assert(&(_sc)->sc_mtx, RA_RLOCKED) #define LAGG_WLOCK_ASSERT(_sc) rm_assert(&(_sc)->sc_mtx, RA_WLOCKED) extern struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); extern void (*lagg_linkstate_p)(struct ifnet *, int ); int lagg_enqueue(struct ifnet *, struct mbuf *); uint32_t lagg_hashmbuf(struct lagg_softc *, struct mbuf *, uint32_t); SYSCTL_DECL(_net_link_lagg); #endif /* _KERNEL */ #endif /* _NET_LAGG_H */ Index: head/sys/net/if_vxlan.c =================================================================== --- head/sys/net/if_vxlan.c (revision 275357) +++ head/sys/net/if_vxlan.c (revision 275358) @@ -1,3089 +1,3090 @@ /*- * Copyright (c) 2014, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vxlan_softc; LIST_HEAD(vxlan_softc_head, vxlan_softc); struct vxlan_socket_mc_info { union vxlan_sockaddr vxlsomc_saddr; union vxlan_sockaddr vxlsomc_gaddr; int vxlsomc_ifidx; int vxlsomc_users; }; #define VXLAN_SO_MC_MAX_GROUPS 32 #define VXLAN_SO_VNI_HASH_SHIFT 6 #define VXLAN_SO_VNI_HASH_SIZE (1 << VXLAN_SO_VNI_HASH_SHIFT) #define VXLAN_SO_VNI_HASH(_vni) ((_vni) % VXLAN_SO_VNI_HASH_SIZE) struct vxlan_socket { struct socket *vxlso_sock; struct rmlock vxlso_lock; u_int vxlso_refcnt; union vxlan_sockaddr vxlso_laddr; LIST_ENTRY(vxlan_socket) vxlso_entry; struct vxlan_softc_head vxlso_vni_hash[VXLAN_SO_VNI_HASH_SIZE]; struct vxlan_socket_mc_info vxlso_mc[VXLAN_SO_MC_MAX_GROUPS]; }; #define VXLAN_SO_RLOCK(_vso, _p) rm_rlock(&(_vso)->vxlso_lock, (_p)) #define VXLAN_SO_RUNLOCK(_vso, _p) rm_runlock(&(_vso)->vxlso_lock, (_p)) #define VXLAN_SO_WLOCK(_vso) rm_wlock(&(_vso)->vxlso_lock) #define VXLAN_SO_WUNLOCK(_vso) rm_wunlock(&(_vso)->vxlso_lock) #define VXLAN_SO_LOCK_ASSERT(_vso) \ rm_assert(&(_vso)->vxlso_lock, RA_LOCKED) #define VXLAN_SO_LOCK_WASSERT(_vso) \ rm_assert(&(_vso)->vxlso_lock, RA_WLOCKED) #define VXLAN_SO_ACQUIRE(_vso) refcount_acquire(&(_vso)->vxlso_refcnt) #define VXLAN_SO_RELEASE(_vso) refcount_release(&(_vso)->vxlso_refcnt) struct vxlan_ftable_entry { LIST_ENTRY(vxlan_ftable_entry) vxlfe_hash; uint16_t vxlfe_flags; uint8_t vxlfe_mac[ETHER_ADDR_LEN]; union vxlan_sockaddr vxlfe_raddr; time_t vxlfe_expire; }; #define VXLAN_FE_FLAG_DYNAMIC 0x01 #define VXLAN_FE_FLAG_STATIC 0x02 #define VXLAN_FE_IS_DYNAMIC(_fe) \ ((_fe)->vxlfe_flags & VXLAN_FE_FLAG_DYNAMIC) #define VXLAN_SC_FTABLE_SHIFT 9 #define VXLAN_SC_FTABLE_SIZE (1 << VXLAN_SC_FTABLE_SHIFT) #define VXLAN_SC_FTABLE_MASK (VXLAN_SC_FTABLE_SIZE - 1) #define VXLAN_SC_FTABLE_HASH(_sc, _mac) \ (vxlan_mac_hash(_sc, _mac) % VXLAN_SC_FTABLE_SIZE) LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry); struct vxlan_statistics { uint32_t ftable_nospace; uint32_t ftable_lock_upgrade_failed; }; struct vxlan_softc { struct ifnet *vxl_ifp; struct vxlan_socket *vxl_sock; uint32_t vxl_vni; union vxlan_sockaddr vxl_src_addr; union vxlan_sockaddr vxl_dst_addr; uint32_t vxl_flags; #define VXLAN_FLAG_INIT 0x0001 #define VXLAN_FLAG_TEARDOWN 0x0002 #define VXLAN_FLAG_LEARN 0x0004 uint32_t vxl_port_hash_key; uint16_t vxl_min_port; uint16_t vxl_max_port; uint8_t vxl_ttl; /* Lookup table from MAC address to forwarding entry. */ uint32_t vxl_ftable_cnt; uint32_t vxl_ftable_max; uint32_t vxl_ftable_timeout; uint32_t vxl_ftable_hash_key; struct vxlan_ftable_head *vxl_ftable; /* Derived from vxl_dst_addr. */ struct vxlan_ftable_entry vxl_default_fe; struct ip_moptions *vxl_im4o; struct ip6_moptions *vxl_im6o; struct rmlock vxl_lock; volatile u_int vxl_refcnt; int vxl_unit; int vxl_vso_mc_index; struct vxlan_statistics vxl_stats; struct sysctl_oid *vxl_sysctl_node; struct sysctl_ctx_list vxl_sysctl_ctx; struct callout vxl_callout; uint8_t vxl_hwaddr[ETHER_ADDR_LEN]; int vxl_mc_ifindex; struct ifnet *vxl_mc_ifp; char vxl_mc_ifname[IFNAMSIZ]; LIST_ENTRY(vxlan_softc) vxl_entry; LIST_ENTRY(vxlan_softc) vxl_ifdetach_list; }; #define VXLAN_RLOCK(_sc, _p) rm_rlock(&(_sc)->vxl_lock, (_p)) #define VXLAN_RUNLOCK(_sc, _p) rm_runlock(&(_sc)->vxl_lock, (_p)) #define VXLAN_WLOCK(_sc) rm_wlock(&(_sc)->vxl_lock) #define VXLAN_WUNLOCK(_sc) rm_wunlock(&(_sc)->vxl_lock) #define VXLAN_LOCK_WOWNED(_sc) rm_wowned(&(_sc)->vxl_lock) #define VXLAN_LOCK_ASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_LOCKED) #define VXLAN_LOCK_WASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_WLOCKED) #define VXLAN_UNLOCK(_sc, _p) do { \ if (VXLAN_LOCK_WOWNED(_sc)) \ VXLAN_WUNLOCK(_sc); \ else \ VXLAN_RUNLOCK(_sc, _p); \ } while (0) #define VXLAN_ACQUIRE(_sc) refcount_acquire(&(_sc)->vxl_refcnt) #define VXLAN_RELEASE(_sc) refcount_release(&(_sc)->vxl_refcnt) #define satoconstsin(sa) ((const struct sockaddr_in *)(sa)) #define satoconstsin6(sa) ((const struct sockaddr_in6 *)(sa)) struct vxlanudphdr { struct udphdr vxlh_udp; struct vxlan_header vxlh_hdr; } __packed; static int vxlan_ftable_addr_cmp(const uint8_t *, const uint8_t *); static void vxlan_ftable_init(struct vxlan_softc *); static void vxlan_ftable_fini(struct vxlan_softc *); static void vxlan_ftable_flush(struct vxlan_softc *, int); static void vxlan_ftable_expire(struct vxlan_softc *); static int vxlan_ftable_update_locked(struct vxlan_softc *, const struct sockaddr *, const uint8_t *, struct rm_priotracker *); static int vxlan_ftable_update(struct vxlan_softc *, const struct sockaddr *, const uint8_t *); static int vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS); static struct vxlan_ftable_entry * vxlan_ftable_entry_alloc(void); static void vxlan_ftable_entry_free(struct vxlan_ftable_entry *); static void vxlan_ftable_entry_init(struct vxlan_softc *, struct vxlan_ftable_entry *, const uint8_t *, const struct sockaddr *, uint32_t); static void vxlan_ftable_entry_destroy(struct vxlan_softc *, struct vxlan_ftable_entry *); static int vxlan_ftable_entry_insert(struct vxlan_softc *, struct vxlan_ftable_entry *); static struct vxlan_ftable_entry * vxlan_ftable_entry_lookup(struct vxlan_softc *, const uint8_t *); static void vxlan_ftable_entry_dump(struct vxlan_ftable_entry *, struct sbuf *); static struct vxlan_socket * vxlan_socket_alloc(const union vxlan_sockaddr *); static void vxlan_socket_destroy(struct vxlan_socket *); static void vxlan_socket_release(struct vxlan_socket *); static struct vxlan_socket * vxlan_socket_lookup(union vxlan_sockaddr *vxlsa); static void vxlan_socket_insert(struct vxlan_socket *); static int vxlan_socket_init(struct vxlan_socket *, struct ifnet *); static int vxlan_socket_bind(struct vxlan_socket *, struct ifnet *); static int vxlan_socket_create(struct ifnet *, int, const union vxlan_sockaddr *, struct vxlan_socket **); static void vxlan_socket_ifdetach(struct vxlan_socket *, struct ifnet *, struct vxlan_softc_head *); static struct vxlan_socket * vxlan_socket_mc_lookup(const union vxlan_sockaddr *); static int vxlan_sockaddr_mc_info_match( const struct vxlan_socket_mc_info *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int); static int vxlan_socket_mc_join_group(struct vxlan_socket *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int *, union vxlan_sockaddr *); static int vxlan_socket_mc_leave_group(struct vxlan_socket *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int); static int vxlan_socket_mc_add_group(struct vxlan_socket *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int, int *); static void vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *, int); static struct vxlan_softc * vxlan_socket_lookup_softc_locked(struct vxlan_socket *, uint32_t); static struct vxlan_softc * vxlan_socket_lookup_softc(struct vxlan_socket *, uint32_t); static int vxlan_socket_insert_softc(struct vxlan_socket *, struct vxlan_softc *); static void vxlan_socket_remove_softc(struct vxlan_socket *, struct vxlan_softc *); static struct ifnet * vxlan_multicast_if_ref(struct vxlan_softc *, int); static void vxlan_free_multicast(struct vxlan_softc *); static int vxlan_setup_multicast_interface(struct vxlan_softc *); static int vxlan_setup_multicast(struct vxlan_softc *); static int vxlan_setup_socket(struct vxlan_softc *); static void vxlan_setup_interface(struct vxlan_softc *); static int vxlan_valid_init_config(struct vxlan_softc *); static void vxlan_init_wait(struct vxlan_softc *); static void vxlan_init_complete(struct vxlan_softc *); static void vxlan_init(void *); static void vxlan_release(struct vxlan_softc *); static void vxlan_teardown_wait(struct vxlan_softc *); static void vxlan_teardown_complete(struct vxlan_softc *); static void vxlan_teardown_locked(struct vxlan_softc *); static void vxlan_teardown(struct vxlan_softc *); static void vxlan_ifdetach(struct vxlan_softc *, struct ifnet *, struct vxlan_softc_head *); static void vxlan_timer(void *); static int vxlan_ctrl_get_config(struct vxlan_softc *, void *); static int vxlan_ctrl_set_vni(struct vxlan_softc *, void *); static int vxlan_ctrl_set_local_addr(struct vxlan_softc *, void *); static int vxlan_ctrl_set_remote_addr(struct vxlan_softc *, void *); static int vxlan_ctrl_set_local_port(struct vxlan_softc *, void *); static int vxlan_ctrl_set_remote_port(struct vxlan_softc *, void *); static int vxlan_ctrl_set_port_range(struct vxlan_softc *, void *); static int vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *, void *); static int vxlan_ctrl_set_ftable_max(struct vxlan_softc *, void *); static int vxlan_ctrl_set_multicast_if(struct vxlan_softc * , void *); static int vxlan_ctrl_set_ttl(struct vxlan_softc *, void *); static int vxlan_ctrl_set_learn(struct vxlan_softc *, void *); static int vxlan_ctrl_ftable_entry_add(struct vxlan_softc *, void *); static int vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *, void *); static int vxlan_ctrl_flush(struct vxlan_softc *, void *); static int vxlan_ioctl_drvspec(struct vxlan_softc *, struct ifdrv *, int); static int vxlan_ioctl_ifflags(struct vxlan_softc *); static int vxlan_ioctl(struct ifnet *, u_long, caddr_t); #if defined(INET) || defined(INET6) static uint16_t vxlan_pick_source_port(struct vxlan_softc *, struct mbuf *); static void vxlan_encap_header(struct vxlan_softc *, struct mbuf *, int, uint16_t, uint16_t); #endif static int vxlan_encap4(struct vxlan_softc *, const union vxlan_sockaddr *, struct mbuf *); static int vxlan_encap6(struct vxlan_softc *, const union vxlan_sockaddr *, struct mbuf *); static int vxlan_transmit(struct ifnet *, struct mbuf *); static void vxlan_qflush(struct ifnet *); static void vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); static int vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **, const struct sockaddr *); static void vxlan_set_default_config(struct vxlan_softc *); static int vxlan_set_user_config(struct vxlan_softc *, struct ifvxlanparam *); static int vxlan_clone_create(struct if_clone *, int, caddr_t); static void vxlan_clone_destroy(struct ifnet *); static uint32_t vxlan_mac_hash(struct vxlan_softc *, const uint8_t *); static void vxlan_fakeaddr(struct vxlan_softc *); static int vxlan_sockaddr_cmp(const union vxlan_sockaddr *, const struct sockaddr *); static void vxlan_sockaddr_copy(union vxlan_sockaddr *, const struct sockaddr *); static int vxlan_sockaddr_in_equal(const union vxlan_sockaddr *, const struct sockaddr *); static void vxlan_sockaddr_in_copy(union vxlan_sockaddr *, const struct sockaddr *); static int vxlan_sockaddr_supported(const union vxlan_sockaddr *, int); static int vxlan_sockaddr_in_any(const union vxlan_sockaddr *); static int vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *); static int vxlan_can_change_config(struct vxlan_softc *); static int vxlan_check_vni(uint32_t); static int vxlan_check_ttl(int); static int vxlan_check_ftable_timeout(uint32_t); static int vxlan_check_ftable_max(uint32_t); static void vxlan_sysctl_setup(struct vxlan_softc *); static void vxlan_sysctl_destroy(struct vxlan_softc *); static int vxlan_tunable_int(struct vxlan_softc *, const char *, int); static void vxlan_ifdetach_event(void *, struct ifnet *); static void vxlan_load(void); static void vxlan_unload(void); static int vxlan_modevent(module_t, int, void *); static const char vxlan_name[] = "vxlan"; static MALLOC_DEFINE(M_VXLAN, vxlan_name, "Virtual eXtensible LAN Interface"); static struct if_clone *vxlan_cloner; static struct mtx vxlan_list_mtx; static LIST_HEAD(, vxlan_socket) vxlan_socket_list; static eventhandler_tag vxlan_ifdetach_event_tag; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, vxlan, CTLFLAG_RW, 0, "Virtual eXtensible Local Area Network"); static int vxlan_legacy_port = 0; TUNABLE_INT("net.link.vxlan.legacy_port", &vxlan_legacy_port); static int vxlan_reuse_port = 0; TUNABLE_INT("net.link.vxlan.reuse_port", &vxlan_reuse_port); /* Default maximum number of addresses in the forwarding table. */ #ifndef VXLAN_FTABLE_MAX #define VXLAN_FTABLE_MAX 2000 #endif /* Timeout (in seconds) of addresses learned in the forwarding table. */ #ifndef VXLAN_FTABLE_TIMEOUT #define VXLAN_FTABLE_TIMEOUT (20 * 60) #endif /* * Maximum timeout (in seconds) of addresses learned in the forwarding * table. */ #ifndef VXLAN_FTABLE_MAX_TIMEOUT #define VXLAN_FTABLE_MAX_TIMEOUT (60 * 60 * 24) #endif /* Number of seconds between pruning attempts of the forwarding table. */ #ifndef VXLAN_FTABLE_PRUNE #define VXLAN_FTABLE_PRUNE (5 * 60) #endif static int vxlan_ftable_prune_period = VXLAN_FTABLE_PRUNE; struct vxlan_control { int (*vxlc_func)(struct vxlan_softc *, void *); int vxlc_argsize; int vxlc_flags; #define VXLAN_CTRL_FLAG_COPYIN 0x01 #define VXLAN_CTRL_FLAG_COPYOUT 0x02 #define VXLAN_CTRL_FLAG_SUSER 0x04 }; static const struct vxlan_control vxlan_control_table[] = { [VXLAN_CMD_GET_CONFIG] = { vxlan_ctrl_get_config, sizeof(struct ifvxlancfg), VXLAN_CTRL_FLAG_COPYOUT }, [VXLAN_CMD_SET_VNI] = { vxlan_ctrl_set_vni, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_LOCAL_ADDR] = { vxlan_ctrl_set_local_addr, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_REMOTE_ADDR] = { vxlan_ctrl_set_remote_addr, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_LOCAL_PORT] = { vxlan_ctrl_set_local_port, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_REMOTE_PORT] = { vxlan_ctrl_set_remote_port, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_PORT_RANGE] = { vxlan_ctrl_set_port_range, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_FTABLE_TIMEOUT] = { vxlan_ctrl_set_ftable_timeout, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_FTABLE_MAX] = { vxlan_ctrl_set_ftable_max, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_MULTICAST_IF] = { vxlan_ctrl_set_multicast_if, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_TTL] = { vxlan_ctrl_set_ttl, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_LEARN] = { vxlan_ctrl_set_learn, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FTABLE_ENTRY_ADD] = { vxlan_ctrl_ftable_entry_add, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FTABLE_ENTRY_REM] = { vxlan_ctrl_ftable_entry_rem, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FLUSH] = { vxlan_ctrl_flush, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, }; static const int vxlan_control_table_size = nitems(vxlan_control_table); static int vxlan_ftable_addr_cmp(const uint8_t *a, const uint8_t *b) { int i, d; for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) d = ((int)a[i]) - ((int)b[i]); return (d); } static void vxlan_ftable_init(struct vxlan_softc *sc) { int i; sc->vxl_ftable = malloc(sizeof(struct vxlan_ftable_head) * VXLAN_SC_FTABLE_SIZE, M_VXLAN, M_ZERO | M_WAITOK); for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) LIST_INIT(&sc->vxl_ftable[i]); sc->vxl_ftable_hash_key = arc4random(); } static void vxlan_ftable_fini(struct vxlan_softc *sc) { int i; for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { KASSERT(LIST_EMPTY(&sc->vxl_ftable[i]), ("%s: vxlan %p ftable[%d] not empty", __func__, sc, i)); } MPASS(sc->vxl_ftable_cnt == 0); free(sc->vxl_ftable, M_VXLAN); sc->vxl_ftable = NULL; } static void vxlan_ftable_flush(struct vxlan_softc *sc, int all) { struct vxlan_ftable_entry *fe, *tfe; int i; for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) { if (all || VXLAN_FE_IS_DYNAMIC(fe)) vxlan_ftable_entry_destroy(sc, fe); } } } static void vxlan_ftable_expire(struct vxlan_softc *sc) { struct vxlan_ftable_entry *fe, *tfe; int i; VXLAN_LOCK_WASSERT(sc); for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) { if (VXLAN_FE_IS_DYNAMIC(fe) && time_uptime >= fe->vxlfe_expire) vxlan_ftable_entry_destroy(sc, fe); } } } static int vxlan_ftable_update_locked(struct vxlan_softc *sc, const struct sockaddr *sa, const uint8_t *mac, struct rm_priotracker *tracker) { union vxlan_sockaddr vxlsa; struct vxlan_ftable_entry *fe; int error; VXLAN_LOCK_ASSERT(sc); again: /* * A forwarding entry for this MAC address might already exist. If * so, update it, otherwise create a new one. We may have to upgrade * the lock if we have to change or create an entry. */ fe = vxlan_ftable_entry_lookup(sc, mac); if (fe != NULL) { fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout; if (!VXLAN_FE_IS_DYNAMIC(fe) || vxlan_sockaddr_in_equal(&fe->vxlfe_raddr, sa)) return (0); if (!VXLAN_LOCK_WOWNED(sc)) { VXLAN_RUNLOCK(sc, tracker); VXLAN_WLOCK(sc); sc->vxl_stats.ftable_lock_upgrade_failed++; goto again; } vxlan_sockaddr_in_copy(&fe->vxlfe_raddr, sa); return (0); } if (!VXLAN_LOCK_WOWNED(sc)) { VXLAN_RUNLOCK(sc, tracker); VXLAN_WLOCK(sc); sc->vxl_stats.ftable_lock_upgrade_failed++; goto again; } if (sc->vxl_ftable_cnt >= sc->vxl_ftable_max) { sc->vxl_stats.ftable_nospace++; return (ENOSPC); } fe = vxlan_ftable_entry_alloc(); if (fe == NULL) return (ENOMEM); /* * The source port may be randomly select by the remove host, so * use the port of the default destination address. */ vxlan_sockaddr_copy(&vxlsa, sa); vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port; vxlan_ftable_entry_init(sc, fe, mac, &vxlsa.sa, VXLAN_FE_FLAG_DYNAMIC); /* The prior lookup failed, so the insert should not. */ error = vxlan_ftable_entry_insert(sc, fe); MPASS(error == 0); return (0); } static int vxlan_ftable_update(struct vxlan_softc *sc, const struct sockaddr *sa, const uint8_t *mac) { struct rm_priotracker tracker; int error; VXLAN_RLOCK(sc, &tracker); error = vxlan_ftable_update_locked(sc, sa, mac, &tracker); VXLAN_UNLOCK(sc, &tracker); return (error); } static int vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sbuf sb; struct vxlan_softc *sc; struct vxlan_ftable_entry *fe; size_t size; int i, error; /* * This is mostly intended for debugging during development. It is * not practical to dump an entire large table this way. */ sc = arg1; size = PAGE_SIZE; /* Calculate later. */ sbuf_new(&sb, NULL, size, SBUF_FIXEDLEN); sbuf_putc(&sb, '\n'); VXLAN_RLOCK(sc, &tracker); for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { LIST_FOREACH(fe, &sc->vxl_ftable[i], vxlfe_hash) { if (sbuf_error(&sb) != 0) break; vxlan_ftable_entry_dump(fe, &sb); } } VXLAN_RUNLOCK(sc, &tracker); if (sbuf_len(&sb) == 1) sbuf_setpos(&sb, 0); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (error); } static struct vxlan_ftable_entry * vxlan_ftable_entry_alloc(void) { struct vxlan_ftable_entry *fe; fe = malloc(sizeof(*fe), M_VXLAN, M_ZERO | M_NOWAIT); return (fe); } static void vxlan_ftable_entry_free(struct vxlan_ftable_entry *fe) { free(fe, M_VXLAN); } static void vxlan_ftable_entry_init(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe, const uint8_t *mac, const struct sockaddr *sa, uint32_t flags) { fe->vxlfe_flags = flags; fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout; memcpy(fe->vxlfe_mac, mac, ETHER_ADDR_LEN); vxlan_sockaddr_copy(&fe->vxlfe_raddr, sa); } static void vxlan_ftable_entry_destroy(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe) { sc->vxl_ftable_cnt--; LIST_REMOVE(fe, vxlfe_hash); vxlan_ftable_entry_free(fe); } static int vxlan_ftable_entry_insert(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe) { struct vxlan_ftable_entry *lfe; uint32_t hash; int dir; VXLAN_LOCK_WASSERT(sc); hash = VXLAN_SC_FTABLE_HASH(sc, fe->vxlfe_mac); lfe = LIST_FIRST(&sc->vxl_ftable[hash]); if (lfe == NULL) { LIST_INSERT_HEAD(&sc->vxl_ftable[hash], fe, vxlfe_hash); goto out; } do { dir = vxlan_ftable_addr_cmp(fe->vxlfe_mac, lfe->vxlfe_mac); if (dir == 0) return (EEXIST); if (dir > 0) { LIST_INSERT_BEFORE(lfe, fe, vxlfe_hash); goto out; } else if (LIST_NEXT(lfe, vxlfe_hash) == NULL) { LIST_INSERT_AFTER(lfe, fe, vxlfe_hash); goto out; } else lfe = LIST_NEXT(lfe, vxlfe_hash); } while (lfe != NULL); out: sc->vxl_ftable_cnt++; return (0); } static struct vxlan_ftable_entry * vxlan_ftable_entry_lookup(struct vxlan_softc *sc, const uint8_t *mac) { struct vxlan_ftable_entry *fe; uint32_t hash; int dir; VXLAN_LOCK_ASSERT(sc); hash = VXLAN_SC_FTABLE_HASH(sc, mac); LIST_FOREACH(fe, &sc->vxl_ftable[hash], vxlfe_hash) { dir = vxlan_ftable_addr_cmp(fe->vxlfe_mac, mac); if (dir == 0) return (fe); if (dir > 0) break; } return (NULL); } static void vxlan_ftable_entry_dump(struct vxlan_ftable_entry *fe, struct sbuf *sb) { char buf[64]; const union vxlan_sockaddr *sa; const void *addr; int i, len, af, width; sa = &fe->vxlfe_raddr; af = sa->sa.sa_family; len = sbuf_len(sb); sbuf_printf(sb, "%c 0x%02X ", VXLAN_FE_IS_DYNAMIC(fe) ? 'D' : 'S', fe->vxlfe_flags); for (i = 0; i < ETHER_ADDR_LEN - 1; i++) sbuf_printf(sb, "%02X:", fe->vxlfe_mac[i]); sbuf_printf(sb, "%02X ", fe->vxlfe_mac[i]); if (af == AF_INET) { addr = &sa->in4.sin_addr; width = INET_ADDRSTRLEN - 1; } else { addr = &sa->in6.sin6_addr; width = INET6_ADDRSTRLEN - 1; } inet_ntop(af, addr, buf, sizeof(buf)); sbuf_printf(sb, "%*s ", width, buf); sbuf_printf(sb, "%08jd", (intmax_t)fe->vxlfe_expire); sbuf_putc(sb, '\n'); /* Truncate a partial line. */ if (sbuf_error(sb) != 0) sbuf_setpos(sb, len); } static struct vxlan_socket * vxlan_socket_alloc(const union vxlan_sockaddr *sa) { struct vxlan_socket *vso; int i; vso = malloc(sizeof(*vso), M_VXLAN, M_WAITOK | M_ZERO); rm_init(&vso->vxlso_lock, "vxlansorm"); refcount_init(&vso->vxlso_refcnt, 0); for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) LIST_INIT(&vso->vxlso_vni_hash[i]); vso->vxlso_laddr = *sa; return (vso); } static void vxlan_socket_destroy(struct vxlan_socket *vso) { struct socket *so; struct vxlan_socket_mc_info *mc; int i; for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) { mc = &vso->vxlso_mc[i]; KASSERT(mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC, ("%s: socket %p mc[%d] still has address", __func__, vso, i)); } for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) { KASSERT(LIST_EMPTY(&vso->vxlso_vni_hash[i]), ("%s: socket %p vni_hash[%d] not empty", __func__, vso, i)); } so = vso->vxlso_sock; if (so != NULL) { vso->vxlso_sock = NULL; soclose(so); } rm_destroy(&vso->vxlso_lock); free(vso, M_VXLAN); } static void vxlan_socket_release(struct vxlan_socket *vso) { int destroy; mtx_lock(&vxlan_list_mtx); destroy = VXLAN_SO_RELEASE(vso); if (destroy != 0) LIST_REMOVE(vso, vxlso_entry); mtx_unlock(&vxlan_list_mtx); if (destroy != 0) vxlan_socket_destroy(vso); } static struct vxlan_socket * vxlan_socket_lookup(union vxlan_sockaddr *vxlsa) { struct vxlan_socket *vso; mtx_lock(&vxlan_list_mtx); LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) { if (vxlan_sockaddr_cmp(&vso->vxlso_laddr, &vxlsa->sa) == 0) { VXLAN_SO_ACQUIRE(vso); break; } } mtx_unlock(&vxlan_list_mtx); return (vso); } static void vxlan_socket_insert(struct vxlan_socket *vso) { mtx_lock(&vxlan_list_mtx); VXLAN_SO_ACQUIRE(vso); LIST_INSERT_HEAD(&vxlan_socket_list, vso, vxlso_entry); mtx_unlock(&vxlan_list_mtx); } static int vxlan_socket_init(struct vxlan_socket *vso, struct ifnet *ifp) { struct thread *td; int error; td = curthread; error = socreate(vso->vxlso_laddr.sa.sa_family, &vso->vxlso_sock, SOCK_DGRAM, IPPROTO_UDP, td->td_ucred, td); if (error) { if_printf(ifp, "cannot create socket: %d\n", error); return (error); } error = udp_set_kernel_tunneling(vso->vxlso_sock, vxlan_rcv_udp_packet, vso); if (error) { if_printf(ifp, "cannot set tunneling function: %d\n", error); return (error); } if (vxlan_reuse_port != 0) { struct sockopt sopt; int val = 1; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = SO_REUSEPORT; sopt.sopt_val = &val; sopt.sopt_valsize = sizeof(val); error = sosetopt(vso->vxlso_sock, &sopt); if (error) { if_printf(ifp, "cannot set REUSEADDR socket opt: %d\n", error); return (error); } } return (0); } static int vxlan_socket_bind(struct vxlan_socket *vso, struct ifnet *ifp) { union vxlan_sockaddr laddr; struct thread *td; int error; td = curthread; laddr = vso->vxlso_laddr; error = sobind(vso->vxlso_sock, &laddr.sa, td); if (error) { if (error != EADDRINUSE) if_printf(ifp, "cannot bind socket: %d\n", error); return (error); } return (0); } static int vxlan_socket_create(struct ifnet *ifp, int multicast, const union vxlan_sockaddr *saddr, struct vxlan_socket **vsop) { union vxlan_sockaddr laddr; struct vxlan_socket *vso; int error; laddr = *saddr; /* * If this socket will be multicast, then only the local port * must be specified when binding. */ if (multicast != 0) { if (VXLAN_SOCKADDR_IS_IPV4(&laddr)) laddr.in4.sin_addr.s_addr = INADDR_ANY; #ifdef INET6 else laddr.in6.sin6_addr = in6addr_any; #endif } vso = vxlan_socket_alloc(&laddr); if (vso == NULL) return (ENOMEM); error = vxlan_socket_init(vso, ifp); if (error) goto fail; error = vxlan_socket_bind(vso, ifp); if (error) goto fail; /* * There is a small window between the bind completing and * inserting the socket, so that a concurrent create may fail. * Let's not worry about that for now. */ vxlan_socket_insert(vso); *vsop = vso; return (0); fail: vxlan_socket_destroy(vso); return (error); } static void vxlan_socket_ifdetach(struct vxlan_socket *vso, struct ifnet *ifp, struct vxlan_softc_head *list) { struct rm_priotracker tracker; struct vxlan_softc *sc; int i; VXLAN_SO_RLOCK(vso, &tracker); for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) { LIST_FOREACH(sc, &vso->vxlso_vni_hash[i], vxl_entry) vxlan_ifdetach(sc, ifp, list); } VXLAN_SO_RUNLOCK(vso, &tracker); } static struct vxlan_socket * vxlan_socket_mc_lookup(const union vxlan_sockaddr *vxlsa) { struct vxlan_socket *vso; union vxlan_sockaddr laddr; laddr = *vxlsa; if (VXLAN_SOCKADDR_IS_IPV4(&laddr)) laddr.in4.sin_addr.s_addr = INADDR_ANY; #ifdef INET6 else laddr.in6.sin6_addr = in6addr_any; #endif vso = vxlan_socket_lookup(&laddr); return (vso); } static int vxlan_sockaddr_mc_info_match(const struct vxlan_socket_mc_info *mc, const union vxlan_sockaddr *group, const union vxlan_sockaddr *local, int ifidx) { if (!vxlan_sockaddr_in_any(local) && !vxlan_sockaddr_in_equal(&mc->vxlsomc_saddr, &local->sa)) return (0); if (!vxlan_sockaddr_in_equal(&mc->vxlsomc_gaddr, &group->sa)) return (0); if (ifidx != 0 && ifidx != mc->vxlsomc_ifidx) return (0); return (1); } static int vxlan_socket_mc_join_group(struct vxlan_socket *vso, const union vxlan_sockaddr *group, const union vxlan_sockaddr *local, int *ifidx, union vxlan_sockaddr *source) { struct sockopt sopt; int error; *source = *local; if (VXLAN_SOCKADDR_IS_IPV4(group)) { struct ip_mreq mreq; mreq.imr_multiaddr = group->in4.sin_addr; mreq.imr_interface = local->in4.sin_addr; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = IP_ADD_MEMBERSHIP; sopt.sopt_val = &mreq; sopt.sopt_valsize = sizeof(mreq); error = sosetopt(vso->vxlso_sock, &sopt); if (error) return (error); /* * BMV: Ideally, there would be a formal way for us to get * the local interface that was selected based on the * imr_interface address. We could then update *ifidx so * vxlan_sockaddr_mc_info_match() would return a match for * later creates that explicitly set the multicast interface. * * If we really need to, we can of course look in the INP's * membership list: * sotoinpcb(vso->vxlso_sock)->inp_moptions-> * imo_membership[]->inm_ifp * similarly to imo_match_group(). */ source->in4.sin_addr = local->in4.sin_addr; } else if (VXLAN_SOCKADDR_IS_IPV6(group)) { struct ipv6_mreq mreq; mreq.ipv6mr_multiaddr = group->in6.sin6_addr; mreq.ipv6mr_interface = *ifidx; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IPV6; sopt.sopt_name = IPV6_JOIN_GROUP; sopt.sopt_val = &mreq; sopt.sopt_valsize = sizeof(mreq); error = sosetopt(vso->vxlso_sock, &sopt); if (error) return (error); /* * BMV: As with IPv4, we would really like to know what * interface in6p_lookup_mcast_ifp() selected. */ } else error = EAFNOSUPPORT; return (error); } static int vxlan_socket_mc_leave_group(struct vxlan_socket *vso, const union vxlan_sockaddr *group, const union vxlan_sockaddr *source, int ifidx) { struct sockopt sopt; int error; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; if (VXLAN_SOCKADDR_IS_IPV4(group)) { struct ip_mreq mreq; mreq.imr_multiaddr = group->in4.sin_addr; mreq.imr_interface = source->in4.sin_addr; sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = IP_DROP_MEMBERSHIP; sopt.sopt_val = &mreq; sopt.sopt_valsize = sizeof(mreq); error = sosetopt(vso->vxlso_sock, &sopt); } else if (VXLAN_SOCKADDR_IS_IPV6(group)) { struct ipv6_mreq mreq; mreq.ipv6mr_multiaddr = group->in6.sin6_addr; mreq.ipv6mr_interface = ifidx; sopt.sopt_level = IPPROTO_IPV6; sopt.sopt_name = IPV6_LEAVE_GROUP; sopt.sopt_val = &mreq; sopt.sopt_valsize = sizeof(mreq); error = sosetopt(vso->vxlso_sock, &sopt); } else error = EAFNOSUPPORT; return (error); } static int vxlan_socket_mc_add_group(struct vxlan_socket *vso, const union vxlan_sockaddr *group, const union vxlan_sockaddr *local, int ifidx, int *idx) { union vxlan_sockaddr source; struct vxlan_socket_mc_info *mc; int i, empty, error; /* * Within a socket, the same multicast group may be used by multiple * interfaces, each with a different network identifier. But a socket * may only join a multicast group once, so keep track of the users * here. */ VXLAN_SO_WLOCK(vso); for (empty = 0, i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) { mc = &vso->vxlso_mc[i]; if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) { empty++; continue; } if (vxlan_sockaddr_mc_info_match(mc, group, local, ifidx)) goto out; } VXLAN_SO_WUNLOCK(vso); if (empty == 0) return (ENOSPC); error = vxlan_socket_mc_join_group(vso, group, local, &ifidx, &source); if (error) return (error); VXLAN_SO_WLOCK(vso); for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) { mc = &vso->vxlso_mc[i]; if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) { vxlan_sockaddr_copy(&mc->vxlsomc_gaddr, &group->sa); vxlan_sockaddr_copy(&mc->vxlsomc_saddr, &source.sa); mc->vxlsomc_ifidx = ifidx; goto out; } } VXLAN_SO_WUNLOCK(vso); error = vxlan_socket_mc_leave_group(vso, group, &source, ifidx); MPASS(error == 0); return (ENOSPC); out: mc->vxlsomc_users++; VXLAN_SO_WUNLOCK(vso); *idx = i; return (0); } static void vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *vso, int idx) { union vxlan_sockaddr group, source; struct vxlan_socket_mc_info *mc; int ifidx, leave; KASSERT(idx >= 0 && idx < VXLAN_SO_MC_MAX_GROUPS, ("%s: vso %p idx %d out of bounds", __func__, vso, idx)); leave = 0; mc = &vso->vxlso_mc[idx]; VXLAN_SO_WLOCK(vso); mc->vxlsomc_users--; if (mc->vxlsomc_users == 0) { group = mc->vxlsomc_gaddr; source = mc->vxlsomc_saddr; ifidx = mc->vxlsomc_ifidx; bzero(mc, sizeof(*mc)); leave = 1; } VXLAN_SO_WUNLOCK(vso); if (leave != 0) { /* * Our socket's membership in this group may have already * been removed if we joined through an interface that's * been detached. */ vxlan_socket_mc_leave_group(vso, &group, &source, ifidx); } } static struct vxlan_softc * vxlan_socket_lookup_softc_locked(struct vxlan_socket *vso, uint32_t vni) { struct vxlan_softc *sc; uint32_t hash; VXLAN_SO_LOCK_ASSERT(vso); hash = VXLAN_SO_VNI_HASH(vni); LIST_FOREACH(sc, &vso->vxlso_vni_hash[hash], vxl_entry) { if (sc->vxl_vni == vni) { VXLAN_ACQUIRE(sc); break; } } return (sc); } static struct vxlan_softc * vxlan_socket_lookup_softc(struct vxlan_socket *vso, uint32_t vni) { struct rm_priotracker tracker; struct vxlan_softc *sc; VXLAN_SO_RLOCK(vso, &tracker); sc = vxlan_socket_lookup_softc_locked(vso, vni); VXLAN_SO_RUNLOCK(vso, &tracker); return (sc); } static int vxlan_socket_insert_softc(struct vxlan_socket *vso, struct vxlan_softc *sc) { struct vxlan_softc *tsc; uint32_t vni, hash; vni = sc->vxl_vni; hash = VXLAN_SO_VNI_HASH(vni); VXLAN_SO_WLOCK(vso); tsc = vxlan_socket_lookup_softc_locked(vso, vni); if (tsc != NULL) { VXLAN_SO_WUNLOCK(vso); vxlan_release(tsc); return (EEXIST); } VXLAN_ACQUIRE(sc); LIST_INSERT_HEAD(&vso->vxlso_vni_hash[hash], sc, vxl_entry); VXLAN_SO_WUNLOCK(vso); return (0); } static void vxlan_socket_remove_softc(struct vxlan_socket *vso, struct vxlan_softc *sc) { VXLAN_SO_WLOCK(vso); LIST_REMOVE(sc, vxl_entry); VXLAN_SO_WUNLOCK(vso); vxlan_release(sc); } static struct ifnet * vxlan_multicast_if_ref(struct vxlan_softc *sc, int ipv4) { struct ifnet *ifp; VXLAN_LOCK_ASSERT(sc); if (ipv4 && sc->vxl_im4o != NULL) ifp = sc->vxl_im4o->imo_multicast_ifp; else if (!ipv4 && sc->vxl_im6o != NULL) ifp = sc->vxl_im6o->im6o_multicast_ifp; else ifp = NULL; if (ifp != NULL) if_ref(ifp); return (ifp); } static void vxlan_free_multicast(struct vxlan_softc *sc) { if (sc->vxl_mc_ifp != NULL) { if_rele(sc->vxl_mc_ifp); sc->vxl_mc_ifp = NULL; sc->vxl_mc_ifindex = 0; } if (sc->vxl_im4o != NULL) { free(sc->vxl_im4o, M_VXLAN); sc->vxl_im4o = NULL; } if (sc->vxl_im6o != NULL) { free(sc->vxl_im6o, M_VXLAN); sc->vxl_im6o = NULL; } } static int vxlan_setup_multicast_interface(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = ifunit_ref(sc->vxl_mc_ifname); if (ifp == NULL) { if_printf(sc->vxl_ifp, "multicast interfaces %s does " "not exist\n", sc->vxl_mc_ifname); return (ENOENT); } if ((ifp->if_flags & IFF_MULTICAST) == 0) { if_printf(sc->vxl_ifp, "interface %s does not support " "multicast\n", sc->vxl_mc_ifname); if_rele(ifp); return (ENOTSUP); } sc->vxl_mc_ifp = ifp; sc->vxl_mc_ifindex = ifp->if_index; return (0); } static int vxlan_setup_multicast(struct vxlan_softc *sc) { const union vxlan_sockaddr *group; int error; group = &sc->vxl_dst_addr; error = 0; if (sc->vxl_mc_ifname[0] != '\0') { error = vxlan_setup_multicast_interface(sc); if (error) return (error); } /* * Initialize an multicast options structure that is sufficiently * populated for use in the respective IP output routine. This * structure is typically stored in the socket, but our sockets * may be shared among multiple interfaces. */ if (VXLAN_SOCKADDR_IS_IPV4(group)) { sc->vxl_im4o = malloc(sizeof(struct ip_moptions), M_VXLAN, M_ZERO | M_WAITOK); sc->vxl_im4o->imo_multicast_ifp = sc->vxl_mc_ifp; sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl; sc->vxl_im4o->imo_multicast_vif = -1; } else if (VXLAN_SOCKADDR_IS_IPV6(group)) { sc->vxl_im6o = malloc(sizeof(struct ip6_moptions), M_VXLAN, M_ZERO | M_WAITOK); sc->vxl_im6o->im6o_multicast_ifp = sc->vxl_mc_ifp; sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl; } return (error); } static int vxlan_setup_socket(struct vxlan_softc *sc) { struct vxlan_socket *vso; struct ifnet *ifp; union vxlan_sockaddr *saddr, *daddr; int multicast, error; vso = NULL; ifp = sc->vxl_ifp; saddr = &sc->vxl_src_addr; daddr = &sc->vxl_dst_addr; multicast = vxlan_sockaddr_in_multicast(daddr); MPASS(multicast != -1); sc->vxl_vso_mc_index = -1; /* * Try to create the socket. If that fails, attempt to use an * existing socket. */ error = vxlan_socket_create(ifp, multicast, saddr, &vso); if (error) { if (multicast != 0) vso = vxlan_socket_mc_lookup(saddr); else vso = vxlan_socket_lookup(saddr); if (vso == NULL) { if_printf(ifp, "cannot create socket (error: %d), " "and no existing socket found\n", error); goto out; } } if (multicast != 0) { error = vxlan_setup_multicast(sc); if (error) goto out; error = vxlan_socket_mc_add_group(vso, daddr, saddr, sc->vxl_mc_ifindex, &sc->vxl_vso_mc_index); if (error) goto out; } sc->vxl_sock = vso; error = vxlan_socket_insert_softc(vso, sc); if (error) { sc->vxl_sock = NULL; if_printf(ifp, "network identifier %d already exists in " "this socket\n", sc->vxl_vni); goto out; } return (0); out: if (vso != NULL) { if (sc->vxl_vso_mc_index != -1) { vxlan_socket_mc_release_group_by_idx(vso, sc->vxl_vso_mc_index); sc->vxl_vso_mc_index = -1; } if (multicast != 0) vxlan_free_multicast(sc); vxlan_socket_release(vso); } return (error); } static void vxlan_setup_interface(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = sc->vxl_ifp; ifp->if_hdrlen = ETHER_HDR_LEN + sizeof(struct vxlanudphdr); if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr) != 0) ifp->if_hdrlen += sizeof(struct ip); else if (VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_dst_addr) != 0) ifp->if_hdrlen += sizeof(struct ip6_hdr); } static int vxlan_valid_init_config(struct vxlan_softc *sc) { const char *reason; if (vxlan_check_vni(sc->vxl_vni) != 0) { reason = "invalid virtual network identifier specified"; goto fail; } if (vxlan_sockaddr_supported(&sc->vxl_src_addr, 1) == 0) { reason = "source address type is not supported"; goto fail; } if (vxlan_sockaddr_supported(&sc->vxl_dst_addr, 0) == 0) { reason = "destination address type is not supported"; goto fail; } if (vxlan_sockaddr_in_any(&sc->vxl_dst_addr) != 0) { reason = "no valid destination address specified"; goto fail; } if (vxlan_sockaddr_in_multicast(&sc->vxl_dst_addr) == 0 && sc->vxl_mc_ifname[0] != '\0') { reason = "can only specify interface with a group address"; goto fail; } if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) { if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_src_addr) ^ VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr)) { reason = "source and destination address must both " "be either IPv4 or IPv6"; goto fail; } } if (sc->vxl_src_addr.in4.sin_port == 0) { reason = "local port not specified"; goto fail; } if (sc->vxl_dst_addr.in4.sin_port == 0) { reason = "remote port not specified"; goto fail; } return (0); fail: if_printf(sc->vxl_ifp, "cannot initialize interface: %s\n", reason); return (EINVAL); } static void vxlan_init_wait(struct vxlan_softc *sc) { VXLAN_LOCK_WASSERT(sc); while (sc->vxl_flags & VXLAN_FLAG_INIT) rm_sleep(sc, &sc->vxl_lock, 0, "vxlint", hz); } static void vxlan_init_complete(struct vxlan_softc *sc) { VXLAN_WLOCK(sc); sc->vxl_flags &= ~VXLAN_FLAG_INIT; wakeup(sc); VXLAN_WUNLOCK(sc); } static void vxlan_init(void *xsc) { static const uint8_t empty_mac[ETHER_ADDR_LEN]; struct vxlan_softc *sc; struct ifnet *ifp; sc = xsc; ifp = sc->vxl_ifp; VXLAN_WLOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { VXLAN_WUNLOCK(sc); return; } sc->vxl_flags |= VXLAN_FLAG_INIT; VXLAN_WUNLOCK(sc); if (vxlan_valid_init_config(sc) != 0) goto out; vxlan_setup_interface(sc); if (vxlan_setup_socket(sc) != 0) goto out; /* Initialize the default forwarding entry. */ vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac, &sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC); VXLAN_WLOCK(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; callout_reset(&sc->vxl_callout, vxlan_ftable_prune_period * hz, vxlan_timer, sc); VXLAN_WUNLOCK(sc); out: vxlan_init_complete(sc); } static void vxlan_release(struct vxlan_softc *sc) { /* * The softc may be destroyed as soon as we release our reference, * so we cannot serialize the wakeup with the softc lock. We use a * timeout in our sleeps so a missed wakeup is unfortunate but not * fatal. */ if (VXLAN_RELEASE(sc) != 0) wakeup(sc); } static void vxlan_teardown_wait(struct vxlan_softc *sc) { VXLAN_LOCK_WASSERT(sc); while (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) rm_sleep(sc, &sc->vxl_lock, 0, "vxltrn", hz); } static void vxlan_teardown_complete(struct vxlan_softc *sc) { VXLAN_WLOCK(sc); sc->vxl_flags &= ~VXLAN_FLAG_TEARDOWN; wakeup(sc); VXLAN_WUNLOCK(sc); } static void vxlan_teardown_locked(struct vxlan_softc *sc) { struct ifnet *ifp; struct vxlan_socket *vso; ifp = sc->vxl_ifp; VXLAN_LOCK_WASSERT(sc); MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN); ifp->if_flags &= ~IFF_UP; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->vxl_callout); vso = sc->vxl_sock; sc->vxl_sock = NULL; VXLAN_WUNLOCK(sc); if (vso != NULL) { vxlan_socket_remove_softc(vso, sc); if (sc->vxl_vso_mc_index != -1) { vxlan_socket_mc_release_group_by_idx(vso, sc->vxl_vso_mc_index); sc->vxl_vso_mc_index = -1; } } VXLAN_WLOCK(sc); while (sc->vxl_refcnt != 0) rm_sleep(sc, &sc->vxl_lock, 0, "vxldrn", hz); VXLAN_WUNLOCK(sc); callout_drain(&sc->vxl_callout); vxlan_free_multicast(sc); if (vso != NULL) vxlan_socket_release(vso); vxlan_teardown_complete(sc); } static void vxlan_teardown(struct vxlan_softc *sc) { VXLAN_WLOCK(sc); if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) { vxlan_teardown_wait(sc); VXLAN_WUNLOCK(sc); return; } sc->vxl_flags |= VXLAN_FLAG_TEARDOWN; vxlan_teardown_locked(sc); } static void vxlan_ifdetach(struct vxlan_softc *sc, struct ifnet *ifp, struct vxlan_softc_head *list) { VXLAN_WLOCK(sc); if (sc->vxl_mc_ifp != ifp) goto out; if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) goto out; sc->vxl_flags |= VXLAN_FLAG_TEARDOWN; LIST_INSERT_HEAD(list, sc, vxl_ifdetach_list); out: VXLAN_WUNLOCK(sc); } static void vxlan_timer(void *xsc) { struct vxlan_softc *sc; sc = xsc; VXLAN_LOCK_WASSERT(sc); vxlan_ftable_expire(sc); callout_schedule(&sc->vxl_callout, vxlan_ftable_prune_period * hz); } static int vxlan_ioctl_ifflags(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = sc->vxl_ifp; if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) vxlan_init(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) vxlan_teardown(sc); } return (0); } static int vxlan_ctrl_get_config(struct vxlan_softc *sc, void *arg) { struct rm_priotracker tracker; struct ifvxlancfg *cfg; cfg = arg; bzero(cfg, sizeof(*cfg)); VXLAN_RLOCK(sc, &tracker); cfg->vxlc_vni = sc->vxl_vni; memcpy(&cfg->vxlc_local_sa, &sc->vxl_src_addr, sizeof(union vxlan_sockaddr)); memcpy(&cfg->vxlc_remote_sa, &sc->vxl_dst_addr, sizeof(union vxlan_sockaddr)); cfg->vxlc_mc_ifindex = sc->vxl_mc_ifindex; cfg->vxlc_ftable_cnt = sc->vxl_ftable_cnt; cfg->vxlc_ftable_max = sc->vxl_ftable_max; cfg->vxlc_ftable_timeout = sc->vxl_ftable_timeout; cfg->vxlc_port_min = sc->vxl_min_port; cfg->vxlc_port_max = sc->vxl_max_port; cfg->vxlc_learn = (sc->vxl_flags & VXLAN_FLAG_LEARN) != 0; cfg->vxlc_ttl = sc->vxl_ttl; VXLAN_RUNLOCK(sc, &tracker); return (0); } static int vxlan_ctrl_set_vni(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; if (vxlan_check_vni(cmd->vxlcmd_vni) != 0) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { sc->vxl_vni = cmd->vxlcmd_vni; error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; union vxlan_sockaddr *vxlsa; int error; cmd = arg; vxlsa = &cmd->vxlcmd_sa; if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa)) return (EINVAL); if (vxlan_sockaddr_in_multicast(vxlsa) != 0) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; union vxlan_sockaddr *vxlsa; int error; cmd = arg; vxlsa = &cmd->vxlcmd_sa; if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa)) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_local_port(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; if (cmd->vxlcmd_port == 0) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { sc->vxl_src_addr.in4.sin_port = htons(cmd->vxlcmd_port); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_remote_port(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; if (cmd->vxlcmd_port == 0) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { sc->vxl_dst_addr.in4.sin_port = htons(cmd->vxlcmd_port); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_port_range(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; uint16_t min, max; int error; cmd = arg; min = cmd->vxlcmd_port_min; max = cmd->vxlcmd_port_max; if (max < min) return (EINVAL); VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { sc->vxl_min_port = min; sc->vxl_max_port = max; error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; VXLAN_WLOCK(sc); if (vxlan_check_ftable_timeout(cmd->vxlcmd_ftable_timeout) == 0) { sc->vxl_ftable_timeout = cmd->vxlcmd_ftable_timeout; error = 0; } else error = EINVAL; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_ftable_max(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; VXLAN_WLOCK(sc); if (vxlan_check_ftable_max(cmd->vxlcmd_ftable_max) == 0) { sc->vxl_ftable_max = cmd->vxlcmd_ftable_max; error = 0; } else error = EINVAL; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_multicast_if(struct vxlan_softc * sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ); error = 0; } else error = EBUSY; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_ttl(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int error; cmd = arg; VXLAN_WLOCK(sc); if (vxlan_check_ttl(cmd->vxlcmd_ttl) == 0) { sc->vxl_ttl = cmd->vxlcmd_ttl; if (sc->vxl_im4o != NULL) sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl; if (sc->vxl_im6o != NULL) sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl; error = 0; } else error = EINVAL; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_set_learn(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; cmd = arg; VXLAN_WLOCK(sc); if (cmd->vxlcmd_flags & VXLAN_CMD_FLAG_LEARN) sc->vxl_flags |= VXLAN_FLAG_LEARN; else sc->vxl_flags &= ~VXLAN_FLAG_LEARN; VXLAN_WUNLOCK(sc); return (0); } static int vxlan_ctrl_ftable_entry_add(struct vxlan_softc *sc, void *arg) { union vxlan_sockaddr vxlsa; struct ifvxlancmd *cmd; struct vxlan_ftable_entry *fe; int error; cmd = arg; vxlsa = cmd->vxlcmd_sa; if (!VXLAN_SOCKADDR_IS_IPV46(&vxlsa)) return (EINVAL); if (vxlan_sockaddr_in_any(&vxlsa) != 0) return (EINVAL); if (vxlan_sockaddr_in_multicast(&vxlsa) != 0) return (EINVAL); /* BMV: We could support both IPv4 and IPv6 later. */ if (vxlsa.sa.sa_family != sc->vxl_dst_addr.sa.sa_family) return (EAFNOSUPPORT); fe = vxlan_ftable_entry_alloc(); if (fe == NULL) return (ENOMEM); if (vxlsa.in4.sin_port == 0) vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port; vxlan_ftable_entry_init(sc, fe, cmd->vxlcmd_mac, &vxlsa.sa, VXLAN_FE_FLAG_STATIC); VXLAN_WLOCK(sc); error = vxlan_ftable_entry_insert(sc, fe); VXLAN_WUNLOCK(sc); if (error) vxlan_ftable_entry_free(fe); return (error); } static int vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; struct vxlan_ftable_entry *fe; int error; cmd = arg; VXLAN_WLOCK(sc); fe = vxlan_ftable_entry_lookup(sc, cmd->vxlcmd_mac); if (fe != NULL) { vxlan_ftable_entry_destroy(sc, fe); error = 0; } else error = ENOENT; VXLAN_WUNLOCK(sc); return (error); } static int vxlan_ctrl_flush(struct vxlan_softc *sc, void *arg) { struct ifvxlancmd *cmd; int all; cmd = arg; all = cmd->vxlcmd_flags & VXLAN_CMD_FLAG_FLUSH_ALL; VXLAN_WLOCK(sc); vxlan_ftable_flush(sc, all); VXLAN_WUNLOCK(sc); return (0); } static int vxlan_ioctl_drvspec(struct vxlan_softc *sc, struct ifdrv *ifd, int get) { const struct vxlan_control *vc; union { struct ifvxlancfg cfg; struct ifvxlancmd cmd; } args; int out, error; if (ifd->ifd_cmd >= vxlan_control_table_size) return (EINVAL); bzero(&args, sizeof(args)); vc = &vxlan_control_table[ifd->ifd_cmd]; out = (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) != 0; if ((get != 0 && out == 0) || (get == 0 && out != 0)) return (EINVAL); if (vc->vxlc_flags & VXLAN_CTRL_FLAG_SUSER) { error = priv_check(curthread, PRIV_NET_VXLAN); if (error) return (error); } if (ifd->ifd_len != vc->vxlc_argsize || ifd->ifd_len > sizeof(args)) return (EINVAL); if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYIN) { error = copyin(ifd->ifd_data, &args, ifd->ifd_len); if (error) return (error); } error = vc->vxlc_func(sc, &args); if (error) return (error); if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) { error = copyout(&args, ifd->ifd_data, ifd->ifd_len); if (error) return (error); } return (0); } static int vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct vxlan_softc *sc; struct ifreq *ifr; struct ifdrv *ifd; int error; sc = ifp->if_softc; ifr = (struct ifreq *) data; ifd = (struct ifdrv *) data; switch (cmd) { case SIOCADDMULTI: case SIOCDELMULTI: error = 0; break; case SIOCGDRVSPEC: case SIOCSDRVSPEC: error = vxlan_ioctl_drvspec(sc, ifd, cmd == SIOCGDRVSPEC); break; case SIOCSIFFLAGS: error = vxlan_ioctl_ifflags(sc); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } #if defined(INET) || defined(INET6) static uint16_t vxlan_pick_source_port(struct vxlan_softc *sc, struct mbuf *m) { int range; uint32_t hash; range = sc->vxl_max_port - sc->vxl_min_port + 1; + /* check if flowid is set and not opaque */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE && M_HASHTYPE_GET(m) != M_HASHTYPE_OPAQUE) hash = m->m_pkthdr.flowid; else hash = jenkins_hash(m->m_data, ETHER_HDR_LEN, sc->vxl_port_hash_key); return (sc->vxl_min_port + (hash % range)); } static void vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff, uint16_t srcport, uint16_t dstport) { struct vxlanudphdr *hdr; struct udphdr *udph; struct vxlan_header *vxh; int len; len = m->m_pkthdr.len - ipoff; MPASS(len >= sizeof(struct vxlanudphdr)); hdr = mtodo(m, ipoff); udph = &hdr->vxlh_udp; udph->uh_sport = srcport; udph->uh_dport = dstport; udph->uh_ulen = htons(len); udph->uh_sum = 0; vxh = &hdr->vxlh_hdr; vxh->vxlh_flags = htonl(VXLAN_HDR_FLAGS_VALID_VNI); vxh->vxlh_vni = htonl(sc->vxl_vni << VXLAN_HDR_VNI_SHIFT); } #endif static int vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, struct mbuf *m) { #ifdef INET struct ifnet *ifp; struct ip *ip; struct in_addr srcaddr, dstaddr; uint16_t srcport, dstport; int len, mcast, error; ifp = sc->vxl_ifp; srcaddr = sc->vxl_src_addr.in4.sin_addr; srcport = vxlan_pick_source_port(sc, m); dstaddr = fvxlsa->in4.sin_addr; dstport = fvxlsa->in4.sin_port; M_PREPEND(m, sizeof(struct ip) + sizeof(struct vxlanudphdr), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } len = m->m_pkthdr.len; ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(len); ip->ip_off = 0; ip->ip_ttl = sc->vxl_ttl; ip->ip_p = IPPROTO_UDP; ip->ip_sum = 0; ip->ip_src = srcaddr; ip->ip_dst = dstaddr; vxlan_encap_header(sc, m, sizeof(struct ip), srcport, dstport); mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; m->m_flags &= ~(M_MCAST | M_BCAST); error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mcast != 0) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); #else m_freem(m); return (ENOTSUP); #endif } static int vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, struct mbuf *m) { #ifdef INET6 struct ifnet *ifp; struct ip6_hdr *ip6; const struct in6_addr *srcaddr, *dstaddr; uint16_t srcport, dstport; int len, mcast, error; ifp = sc->vxl_ifp; srcaddr = &sc->vxl_src_addr.in6.sin6_addr; srcport = vxlan_pick_source_port(sc, m); dstaddr = &fvxlsa->in6.sin6_addr; dstport = fvxlsa->in6.sin6_port; M_PREPEND(m, sizeof(struct ip6_hdr) + sizeof(struct vxlanudphdr), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } len = m->m_pkthdr.len; ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; /* BMV: Keep in forwarding entry? */ ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = 0; ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_hlim = sc->vxl_ttl; ip6->ip6_src = *srcaddr; ip6->ip6_dst = *dstaddr; vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport); /* * XXX BMV We need support for RFC6935 before we can send and * receive IPv6 UDP packets with a zero checksum. */ { struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr)); hdr->uh_sum = in6_cksum_pseudo(ip6, m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; m->m_flags &= ~(M_MCAST | M_BCAST); error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mcast != 0) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); #else m_freem(m); return (ENOTSUP); #endif } static int vxlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker tracker; union vxlan_sockaddr vxlsa; struct vxlan_softc *sc; struct vxlan_ftable_entry *fe; struct ifnet *mcifp; struct ether_header *eh; int ipv4, error; sc = ifp->if_softc; eh = mtod(m, struct ether_header *); fe = NULL; mcifp = NULL; ETHER_BPF_MTAP(ifp, m); VXLAN_RLOCK(sc, &tracker); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { VXLAN_RUNLOCK(sc, &tracker); m_freem(m); return (ENETDOWN); } if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) fe = vxlan_ftable_entry_lookup(sc, eh->ether_dhost); if (fe == NULL) fe = &sc->vxl_default_fe; vxlan_sockaddr_copy(&vxlsa, &fe->vxlfe_raddr.sa); ipv4 = VXLAN_SOCKADDR_IS_IPV4(&vxlsa) != 0; if (vxlan_sockaddr_in_multicast(&vxlsa) != 0) mcifp = vxlan_multicast_if_ref(sc, ipv4); VXLAN_ACQUIRE(sc); VXLAN_RUNLOCK(sc, &tracker); if (ipv4 != 0) error = vxlan_encap4(sc, &vxlsa, m); else error = vxlan_encap6(sc, &vxlsa, m); vxlan_release(sc); if (mcifp != NULL) if_rele(mcifp); return (error); } static void vxlan_qflush(struct ifnet *ifp __unused) { } static void vxlan_rcv_udp_packet(struct mbuf *m, int offset, struct inpcb *inpcb, const struct sockaddr *srcsa, void *xvso) { struct vxlan_socket *vso; struct vxlan_header *vxh, vxlanhdr; uint32_t vni; int error; M_ASSERTPKTHDR(m); vso = xvso; offset += sizeof(struct udphdr); if (m->m_pkthdr.len < offset + sizeof(struct vxlan_header)) goto out; if (__predict_false(m->m_len < offset + sizeof(struct vxlan_header))) { m_copydata(m, offset, sizeof(struct vxlan_header), (caddr_t) &vxlanhdr); vxh = &vxlanhdr; } else vxh = mtodo(m, offset); /* * Drop if there is a reserved bit set in either the flags or VNI * fields of the header. This goes against the specification, but * a bit set may indicate an unsupported new feature. This matches * the behavior of the Linux implementation. */ if (vxh->vxlh_flags != htonl(VXLAN_HDR_FLAGS_VALID_VNI) || vxh->vxlh_vni & ~htonl(VXLAN_VNI_MASK)) goto out; vni = ntohl(vxh->vxlh_vni) >> VXLAN_HDR_VNI_SHIFT; /* Adjust to the start of the inner Ethernet frame. */ m_adj(m, offset + sizeof(struct vxlan_header)); error = vxlan_input(vso, vni, &m, srcsa); MPASS(error != 0 || m == NULL); out: if (m != NULL) m_freem(m); } static int vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0, const struct sockaddr *sa) { struct vxlan_softc *sc; struct ifnet *ifp; struct mbuf *m; struct ether_header *eh; int error; sc = vxlan_socket_lookup_softc(vso, vni); if (sc == NULL) return (ENOENT); ifp = sc->vxl_ifp; m = *m0; eh = mtod(m, struct ether_header *); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { error = ENETDOWN; goto out; } else if (ifp == m->m_pkthdr.rcvif) { /* XXX Does not catch more complex loops. */ error = EDEADLK; goto out; } if (sc->vxl_flags & VXLAN_FLAG_LEARN) vxlan_ftable_update(sc, sa, eh->ether_shost); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; M_SETFIB(m, ifp->if_fib); error = netisr_queue_src(NETISR_ETHER, 0, m); *m0 = NULL; out: vxlan_release(sc); return (error); } static void vxlan_set_default_config(struct vxlan_softc *sc) { sc->vxl_flags |= VXLAN_FLAG_LEARN; sc->vxl_vni = VXLAN_VNI_MAX; sc->vxl_ttl = IPDEFTTL; if (!vxlan_tunable_int(sc, "legacy_port", vxlan_legacy_port)) { sc->vxl_src_addr.in4.sin_port = htons(VXLAN_PORT); sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_PORT); } else { sc->vxl_src_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT); sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT); } sc->vxl_min_port = V_ipport_firstauto; sc->vxl_max_port = V_ipport_lastauto; sc->vxl_ftable_max = VXLAN_FTABLE_MAX; sc->vxl_ftable_timeout = VXLAN_FTABLE_TIMEOUT; } static int vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp) { #ifndef INET if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR4 | VXLAN_PARAM_WITH_REMOTE_ADDR4)) return (EAFNOSUPPORT); #endif #ifndef INET6 if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR6 | VXLAN_PARAM_WITH_REMOTE_ADDR6)) return (EAFNOSUPPORT); #endif if (vxlp->vxlp_with & VXLAN_PARAM_WITH_VNI) { if (vxlan_check_vni(vxlp->vxlp_vni) == 0) sc->vxl_vni = vxlp->vxlp_vni; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR4) { sc->vxl_src_addr.in4.sin_len = sizeof(struct sockaddr_in); sc->vxl_src_addr.in4.sin_family = AF_INET; sc->vxl_src_addr.in4.sin_addr = vxlp->vxlp_local_in4; } else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) { sc->vxl_src_addr.in6.sin6_len = sizeof(struct sockaddr_in6); sc->vxl_src_addr.in6.sin6_family = AF_INET6; sc->vxl_src_addr.in6.sin6_addr = vxlp->vxlp_local_in6; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR4) { sc->vxl_dst_addr.in4.sin_len = sizeof(struct sockaddr_in); sc->vxl_dst_addr.in4.sin_family = AF_INET; sc->vxl_dst_addr.in4.sin_addr = vxlp->vxlp_remote_in4; } else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) { sc->vxl_dst_addr.in6.sin6_len = sizeof(struct sockaddr_in6); sc->vxl_dst_addr.in6.sin6_family = AF_INET6; sc->vxl_dst_addr.in6.sin6_addr = vxlp->vxlp_remote_in6; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_PORT) sc->vxl_src_addr.in4.sin_port = htons(vxlp->vxlp_local_port); if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_PORT) sc->vxl_dst_addr.in4.sin_port = htons(vxlp->vxlp_remote_port); if (vxlp->vxlp_with & VXLAN_PARAM_WITH_PORT_RANGE) { if (vxlp->vxlp_min_port <= vxlp->vxlp_max_port) { sc->vxl_min_port = vxlp->vxlp_min_port; sc->vxl_max_port = vxlp->vxlp_max_port; } } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_MULTICAST_IF) strlcpy(sc->vxl_mc_ifname, vxlp->vxlp_mc_ifname, IFNAMSIZ); if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_TIMEOUT) { if (vxlan_check_ftable_timeout(vxlp->vxlp_ftable_timeout) == 0) sc->vxl_ftable_timeout = vxlp->vxlp_ftable_timeout; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_MAX) { if (vxlan_check_ftable_max(vxlp->vxlp_ftable_max) == 0) sc->vxl_ftable_max = vxlp->vxlp_ftable_max; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_TTL) { if (vxlan_check_ttl(vxlp->vxlp_ttl) == 0) sc->vxl_ttl = vxlp->vxlp_ttl; } if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LEARN) { if (vxlp->vxlp_learn == 0) sc->vxl_flags &= ~VXLAN_FLAG_LEARN; } return (0); } static int vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct vxlan_softc *sc; struct ifnet *ifp; struct ifvxlanparam vxlp; int error; sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO); sc->vxl_unit = unit; vxlan_set_default_config(sc); if (params != 0) { error = copyin(params, &vxlp, sizeof(vxlp)); if (error) goto fail; error = vxlan_set_user_config(sc, &vxlp); if (error) goto fail; } ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { error = ENOSPC; goto fail; } sc->vxl_ifp = ifp; rm_init(&sc->vxl_lock, "vxlanrm"); callout_init_rw(&sc->vxl_callout, &sc->vxl_lock, 0); sc->vxl_port_hash_key = arc4random(); vxlan_ftable_init(sc); vxlan_sysctl_setup(sc); ifp->if_softc = sc; if_initname(ifp, vxlan_name, unit); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = vxlan_init; ifp->if_ioctl = vxlan_ioctl; ifp->if_transmit = vxlan_transmit; ifp->if_qflush = vxlan_qflush; vxlan_fakeaddr(sc); ether_ifattach(ifp, sc->vxl_hwaddr); ifp->if_baudrate = 0; ifp->if_hdrlen = 0; return (0); fail: free(sc, M_VXLAN); return (error); } static void vxlan_clone_destroy(struct ifnet *ifp) { struct vxlan_softc *sc; sc = ifp->if_softc; vxlan_teardown(sc); vxlan_ftable_flush(sc, 1); ether_ifdetach(ifp); if_free(ifp); vxlan_ftable_fini(sc); vxlan_sysctl_destroy(sc); rm_destroy(&sc->vxl_lock); free(sc, M_VXLAN); } /* BMV: Taken from if_bridge. */ static uint32_t vxlan_mac_hash(struct vxlan_softc *sc, const uint8_t *addr) { uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->vxl_ftable_hash_key; b += addr[5] << 8; b += addr[4]; a += addr[3] << 24; a += addr[2] << 16; a += addr[1] << 8; a += addr[0]; /* * The following hash function is adapted from "Hash Functions" by Bob Jenkins * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). */ #define mix(a, b, c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (0) mix(a, b, c); #undef mix return (c); } static void vxlan_fakeaddr(struct vxlan_softc *sc) { /* * Generate a non-multicast, locally administered address. * * BMV: Should we use the FreeBSD OUI range instead? */ arc4rand(sc->vxl_hwaddr, ETHER_ADDR_LEN, 1); sc->vxl_hwaddr[0] &= ~1; sc->vxl_hwaddr[0] |= 2; } static int vxlan_sockaddr_cmp(const union vxlan_sockaddr *vxladdr, const struct sockaddr *sa) { return (bcmp(&vxladdr->sa, sa, vxladdr->sa.sa_len)); } static void vxlan_sockaddr_copy(union vxlan_sockaddr *vxladdr, const struct sockaddr *sa) { MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6); bzero(vxladdr, sizeof(*vxladdr)); if (sa->sa_family == AF_INET) { vxladdr->in4 = *satoconstsin(sa); vxladdr->in4.sin_len = sizeof(struct sockaddr_in); } else if (sa->sa_family == AF_INET6) { vxladdr->in6 = *satoconstsin6(sa); vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6); } } static int vxlan_sockaddr_in_equal(const union vxlan_sockaddr *vxladdr, const struct sockaddr *sa) { int equal; if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; equal = in4->s_addr == vxladdr->in4.sin_addr.s_addr; } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; equal = IN6_ARE_ADDR_EQUAL(in6, &vxladdr->in6.sin6_addr); } else equal = 0; return (equal); } static void vxlan_sockaddr_in_copy(union vxlan_sockaddr *vxladdr, const struct sockaddr *sa) { MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6); if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; vxladdr->in4.sin_family = AF_INET; vxladdr->in4.sin_len = sizeof(struct sockaddr_in); vxladdr->in4.sin_addr = *in4; } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; vxladdr->in6.sin6_family = AF_INET6; vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6); vxladdr->in6.sin6_addr = *in6; } } static int vxlan_sockaddr_supported(const union vxlan_sockaddr *vxladdr, int unspec) { const struct sockaddr *sa; int supported; sa = &vxladdr->sa; supported = 0; if (sa->sa_family == AF_UNSPEC && unspec != 0) { supported = 1; } else if (sa->sa_family == AF_INET) { #ifdef INET supported = 1; #endif } else if (sa->sa_family == AF_INET6) { #ifdef INET6 supported = 1; #endif } return (supported); } static int vxlan_sockaddr_in_any(const union vxlan_sockaddr *vxladdr) { const struct sockaddr *sa; int any; sa = &vxladdr->sa; if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; any = in4->s_addr == INADDR_ANY; } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; any = IN6_IS_ADDR_UNSPECIFIED(in6); } else any = -1; return (any); } static int vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *vxladdr) { const struct sockaddr *sa; int mc; sa = &vxladdr->sa; if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; mc = IN_MULTICAST(ntohl(in4->s_addr)); } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; mc = IN6_IS_ADDR_MULTICAST(in6); } else mc = -1; return (mc); } static int vxlan_can_change_config(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = sc->vxl_ifp; VXLAN_LOCK_ASSERT(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return (0); if (sc->vxl_flags & (VXLAN_FLAG_INIT | VXLAN_FLAG_TEARDOWN)) return (0); return (1); } static int vxlan_check_vni(uint32_t vni) { return (vni >= VXLAN_VNI_MAX); } static int vxlan_check_ttl(int ttl) { return (ttl > MAXTTL); } static int vxlan_check_ftable_timeout(uint32_t timeout) { return (timeout > VXLAN_FTABLE_MAX_TIMEOUT); } static int vxlan_check_ftable_max(uint32_t max) { return (max > VXLAN_FTABLE_MAX); } static void vxlan_sysctl_setup(struct vxlan_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *node; struct vxlan_statistics *stats; char namebuf[8]; ctx = &sc->vxl_sysctl_ctx; stats = &sc->vxl_stats; snprintf(namebuf, sizeof(namebuf), "%d", sc->vxl_unit); sysctl_ctx_init(ctx); sc->vxl_sysctl_node = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_net_link_vxlan), OID_AUTO, namebuf, CTLFLAG_RD, NULL, ""); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node), OID_AUTO, "ftable", CTLFLAG_RD, NULL, ""); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "count", CTLFLAG_RD, &sc->vxl_ftable_cnt, 0, "Number of entries in fowarding table"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "max", CTLFLAG_RD, &sc->vxl_ftable_max, 0, "Maximum number of entries allowed in fowarding table"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "timeout", CTLFLAG_RD, &sc->vxl_ftable_timeout, 0, "Number of seconds between prunes of the forwarding table"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "dump", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, sc, 0, vxlan_ftable_sysctl_dump, "A", "Dump the forwarding table entries"); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node), OID_AUTO, "stats", CTLFLAG_RD, NULL, ""); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "ftable_nospace", CTLFLAG_RD, &stats->ftable_nospace, 0, "Fowarding table reached maximum entries"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "ftable_lock_upgrade_failed", CTLFLAG_RD, &stats->ftable_lock_upgrade_failed, 0, "Forwarding table update required lock upgrade"); } static void vxlan_sysctl_destroy(struct vxlan_softc *sc) { sysctl_ctx_free(&sc->vxl_sysctl_ctx); sc->vxl_sysctl_node = NULL; } static int vxlan_tunable_int(struct vxlan_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "net.link.vxlan.%d.%s", sc->vxl_unit, knob); TUNABLE_INT_FETCH(path, &def); return (def); } static void vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp) { struct vxlan_softc_head list; struct vxlan_socket *vso; struct vxlan_softc *sc, *tsc; LIST_INIT(&list); if (ifp->if_flags & IFF_RENAMING) return; if ((ifp->if_flags & IFF_MULTICAST) == 0) return; mtx_lock(&vxlan_list_mtx); LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) vxlan_socket_ifdetach(vso, ifp, &list); mtx_unlock(&vxlan_list_mtx); LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) { LIST_REMOVE(sc, vxl_ifdetach_list); VXLAN_WLOCK(sc); if (sc->vxl_flags & VXLAN_FLAG_INIT) vxlan_init_wait(sc); vxlan_teardown_locked(sc); } } static void vxlan_load(void) { mtx_init(&vxlan_list_mtx, "vxlan list", NULL, MTX_DEF); LIST_INIT(&vxlan_socket_list); vxlan_ifdetach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vxlan_ifdetach_event, NULL, EVENTHANDLER_PRI_ANY); vxlan_cloner = if_clone_simple(vxlan_name, vxlan_clone_create, vxlan_clone_destroy, 0); } static void vxlan_unload(void) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, vxlan_ifdetach_event_tag); if_clone_detach(vxlan_cloner); mtx_destroy(&vxlan_list_mtx); MPASS(LIST_EMPTY(&vxlan_socket_list)); } static int vxlan_modevent(module_t mod, int type, void *unused) { int error; error = 0; switch (type) { case MOD_LOAD: vxlan_load(); break; case MOD_UNLOAD: vxlan_unload(); break; default: error = ENOTSUP; break; } return (error); } static moduledata_t vxlan_mod = { "if_vxlan", vxlan_modevent, 0 }; DECLARE_MODULE(if_vxlan, vxlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vxlan, 1); Index: head/sys/net/netisr.c =================================================================== --- head/sys/net/netisr.c (revision 275357) +++ head/sys/net/netisr.c (revision 275358) @@ -1,1368 +1,1369 @@ /*- * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract * to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * netisr is a packet dispatch service, allowing synchronous (directly * dispatched) and asynchronous (deferred dispatch) processing of packets by * registered protocol handlers. Callers pass a protocol identifier and * packet to netisr, along with a direct dispatch hint, and work will either * be immediately processed by the registered handler, or passed to a * software interrupt (SWI) thread for deferred dispatch. Callers will * generally select one or the other based on: * * - Whether directly dispatching a netisr handler lead to code reentrance or * lock recursion, such as entering the socket code from the socket code. * - Whether directly dispatching a netisr handler lead to recursive * processing, such as when decapsulating several wrapped layers of tunnel * information (IPSEC within IPSEC within ...). * * Maintaining ordering for protocol streams is a critical design concern. * Enforcing ordering limits the opportunity for concurrency, but maintains * the strong ordering requirements found in some protocols, such as TCP. Of * related concern is CPU affinity--it is desirable to process all data * associated with a particular stream on the same CPU over time in order to * avoid acquiring locks associated with the connection on different CPUs, * keep connection data in one cache, and to generally encourage associated * user threads to live on the same CPU as the stream. It's also desirable * to avoid lock migration and contention where locks are associated with * more than one flow. * * netisr supports several policy variations, represented by the * NETISR_POLICY_* constants, allowing protocols to play various roles in * identifying flows, assigning work to CPUs, etc. These are described in * netisr.h. */ #include "opt_ddb.h" #include "opt_device_polling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #define _WANT_NETISR_INTERNAL /* Enable definitions from netisr_internal.h */ #include #include #include #include #include /*- * Synchronize use and modification of the registered netisr data structures; * acquire a read lock while modifying the set of registered protocols to * prevent partially registered or unregistered protocols from being run. * * The following data structures and fields are protected by this lock: * * - The netisr_proto array, including all fields of struct netisr_proto. * - The nws array, including all fields of struct netisr_worker. * - The nws_array array. * * Note: the NETISR_LOCKING define controls whether read locks are acquired * in packet processing paths requiring netisr registration stability. This * is disabled by default as it can lead to measurable performance * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and * because netisr registration and unregistration is extremely rare at * runtime. If it becomes more common, this decision should be revisited. * * XXXRW: rmlocks don't support assertions. */ static struct rmlock netisr_rmlock; #define NETISR_LOCK_INIT() rm_init_flags(&netisr_rmlock, "netisr", \ RM_NOWITNESS) #define NETISR_LOCK_ASSERT() #define NETISR_RLOCK(tracker) rm_rlock(&netisr_rmlock, (tracker)) #define NETISR_RUNLOCK(tracker) rm_runlock(&netisr_rmlock, (tracker)) #define NETISR_WLOCK() rm_wlock(&netisr_rmlock) #define NETISR_WUNLOCK() rm_wunlock(&netisr_rmlock) /* #define NETISR_LOCKING */ static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); /*- * Three global direct dispatch policies are supported: * * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of * context (may be overriden by protocols). * * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch, * and we're running on the CPU the work would be performed on, then direct * dispatch it if it wouldn't violate ordering constraints on the workstream. * * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch, * always direct dispatch. (The default.) * * Notice that changing the global policy could lead to short periods of * misordered processing, but this is considered acceptable as compared to * the complexity of enforcing ordering during policy changes. Protocols can * override the global policy (when they're not doing that, they select * NETISR_DISPATCH_DEFAULT). */ #define NETISR_DISPATCH_POLICY_DEFAULT NETISR_DISPATCH_DIRECT #define NETISR_DISPATCH_POLICY_MAXSTR 20 /* Used for temporary buffers. */ static u_int netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT; static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN, 0, 0, sysctl_netisr_dispatch_policy, "A", "netisr dispatch policy"); /* * Allow the administrator to limit the number of threads (CPUs) to use for * netisr. We don't check netisr_maxthreads before creating the thread for * CPU 0, so in practice we ignore values <= 1. This must be set at boot. * We will create at most one thread per CPU. */ static int netisr_maxthreads = -1; /* Max number of threads. */ SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN, &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); /* * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit, * both for initial configuration and later modification using * netisr_setqlimit(). */ #define NETISR_DEFAULT_MAXQLIMIT 10240 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN, &netisr_maxqlimit, 0, "Maximum netisr per-protocol, per-CPU queue depth."); /* * The default per-workstream mbuf queue limit for protocols that don't * initialize the nh_qlimit field of their struct netisr_handler. If this is * set above netisr_maxqlimit, we truncate it to the maximum during boot. */ #define NETISR_DEFAULT_DEFAULTQLIMIT 256 static u_int netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT; SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN, &netisr_defaultqlimit, 0, "Default netisr per-protocol, per-CPU queue limit if not set by protocol"); /* * Store and export the compile-time constant NETISR_MAXPROT limit on the * number of protocols that can register with netisr at a time. This is * required for crashdump analysis, as it sizes netisr_proto[]. */ static u_int netisr_maxprot = NETISR_MAXPROT; SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD, &netisr_maxprot, 0, "Compile-time limit on the number of protocols supported by netisr."); /* * The netisr_proto array describes all registered protocols, indexed by * protocol number. See netisr_internal.h for more details. */ static struct netisr_proto netisr_proto[NETISR_MAXPROT]; /* * Per-CPU workstream data. See netisr_internal.h for more details. */ DPCPU_DEFINE(struct netisr_workstream, nws); /* * Map contiguous values between 0 and nws_count into CPU IDs appropriate for * accessing workstreams. This allows constructions of the form * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws). */ static u_int nws_array[MAXCPU]; /* * Number of registered workstreams. Will be at most the number of running * CPUs once fully started. */ static u_int nws_count; SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, &nws_count, 0, "Number of extant netisr threads."); /* * Synchronization for each workstream: a mutex protects all mutable fields * in each stream, including per-protocol state (mbuf queues). The SWI is * woken up if asynchronous dispatch is required. */ #define NWS_LOCK(s) mtx_lock(&(s)->nws_mtx) #define NWS_LOCK_ASSERT(s) mtx_assert(&(s)->nws_mtx, MA_OWNED) #define NWS_UNLOCK(s) mtx_unlock(&(s)->nws_mtx) #define NWS_SIGNAL(s) swi_sched((s)->nws_swi_cookie, 0) /* * Utility routines for protocols that implement their own mapping of flows * to CPUs. */ u_int netisr_get_cpucount(void) { return (nws_count); } u_int netisr_get_cpuid(u_int cpunumber) { KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber, nws_count)); return (nws_array[cpunumber]); } /* * The default implementation of flow -> CPU ID mapping. * * Non-static so that protocols can use it to map their own work to specific * CPUs in a manner consistent to netisr for affinity purposes. */ u_int netisr_default_flow2cpu(u_int flowid) { return (nws_array[flowid % nws_count]); } /* * Dispatch tunable and sysctl configuration. */ struct netisr_dispatch_table_entry { u_int ndte_policy; const char *ndte_policy_str; }; static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = { { NETISR_DISPATCH_DEFAULT, "default" }, { NETISR_DISPATCH_DEFERRED, "deferred" }, { NETISR_DISPATCH_HYBRID, "hybrid" }, { NETISR_DISPATCH_DIRECT, "direct" }, }; static const u_int netisr_dispatch_table_len = (sizeof(netisr_dispatch_table) / sizeof(netisr_dispatch_table[0])); static void netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer, u_int buflen) { const struct netisr_dispatch_table_entry *ndtep; const char *str; u_int i; str = "unknown"; for (i = 0; i < netisr_dispatch_table_len; i++) { ndtep = &netisr_dispatch_table[i]; if (ndtep->ndte_policy == dispatch_policy) { str = ndtep->ndte_policy_str; break; } } snprintf(buffer, buflen, "%s", str); } static int netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp) { const struct netisr_dispatch_table_entry *ndtep; u_int i; for (i = 0; i < netisr_dispatch_table_len; i++) { ndtep = &netisr_dispatch_table[i]; if (strcmp(ndtep->ndte_policy_str, str) == 0) { *dispatch_policyp = ndtep->ndte_policy; return (0); } } return (EINVAL); } static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) { char tmp[NETISR_DISPATCH_POLICY_MAXSTR]; u_int dispatch_policy; int error; netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp, sizeof(tmp)); error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req); if (error == 0 && req->newptr != NULL) { error = netisr_dispatch_policy_from_str(tmp, &dispatch_policy); if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT) error = EINVAL; if (error == 0) netisr_dispatch_policy = dispatch_policy; } return (error); } /* * Register a new netisr handler, which requires initializing per-protocol * fields for each workstream. All netisr work is briefly suspended while * the protocol is installed. */ void netisr_register(const struct netisr_handler *nhp) { struct netisr_work *npwp; const char *name; u_int i, proto; proto = nhp->nh_proto; name = nhp->nh_name; /* * Test that the requested registration is valid. */ KASSERT(nhp->nh_name != NULL, ("%s: nh_name NULL for %u", __func__, proto)); KASSERT(nhp->nh_handler != NULL, ("%s: nh_handler NULL for %s", __func__, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE || nhp->nh_policy == NETISR_POLICY_FLOW || nhp->nh_policy == NETISR_POLICY_CPU, ("%s: unsupported nh_policy %u for %s", __func__, nhp->nh_policy, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW || nhp->nh_m2flow == NULL, ("%s: nh_policy != FLOW but m2flow defined for %s", __func__, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL, ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__, name)); KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL, ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__, name)); KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT || nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED || nhp->nh_dispatch == NETISR_DISPATCH_HYBRID || nhp->nh_dispatch == NETISR_DISPATCH_DIRECT, ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch)); KASSERT(proto < NETISR_MAXPROT, ("%s(%u, %s): protocol too big", __func__, proto, name)); /* * Test that no existing registration exists for this protocol. */ NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_name == NULL, ("%s(%u, %s): name present", __func__, proto, name)); KASSERT(netisr_proto[proto].np_handler == NULL, ("%s(%u, %s): handler present", __func__, proto, name)); netisr_proto[proto].np_name = name; netisr_proto[proto].np_handler = nhp->nh_handler; netisr_proto[proto].np_m2flow = nhp->nh_m2flow; netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid; netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu; if (nhp->nh_qlimit == 0) netisr_proto[proto].np_qlimit = netisr_defaultqlimit; else if (nhp->nh_qlimit > netisr_maxqlimit) { printf("%s: %s requested queue limit %u capped to " "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit, netisr_maxqlimit); netisr_proto[proto].np_qlimit = netisr_maxqlimit; } else netisr_proto[proto].np_qlimit = nhp->nh_qlimit; netisr_proto[proto].np_policy = nhp->nh_policy; netisr_proto[proto].np_dispatch = nhp->nh_dispatch; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; bzero(npwp, sizeof(*npwp)); npwp->nw_qlimit = netisr_proto[proto].np_qlimit; } NETISR_WUNLOCK(); } /* * Clear drop counters across all workstreams for a protocol. */ void netisr_clearqdrops(const struct netisr_handler *nhp) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; npwp->nw_qdrops = 0; } NETISR_WUNLOCK(); } /* * Query current drop counters across all workstreams for a protocol. */ void netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) { struct netisr_work *npwp; struct rm_priotracker tracker; #ifdef INVARIANTS const char *name; #endif u_int i, proto; *qdropp = 0; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; *qdropp += npwp->nw_qdrops; } NETISR_RUNLOCK(&tracker); } /* * Query current per-workstream queue limit for a protocol. */ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) { struct rm_priotracker tracker; #ifdef INVARIANTS const char *name; #endif u_int proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); *qlimitp = netisr_proto[proto].np_qlimit; NETISR_RUNLOCK(&tracker); } /* * Update the queue limit across per-workstream queues for a protocol. We * simply change the limits, and don't drain overflowed packets as they will * (hopefully) take care of themselves shortly. */ int netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; if (qlimit > netisr_maxqlimit) return (EINVAL); proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); netisr_proto[proto].np_qlimit = qlimit; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; npwp->nw_qlimit = qlimit; } NETISR_WUNLOCK(); return (0); } /* * Drain all packets currently held in a particular protocol work queue. */ static void netisr_drain_proto(struct netisr_work *npwp) { struct mbuf *m; /* * We would assert the lock on the workstream but it's not passed in. */ while ((m = npwp->nw_head) != NULL) { npwp->nw_head = m->m_nextpkt; m->m_nextpkt = NULL; if (npwp->nw_head == NULL) npwp->nw_tail = NULL; npwp->nw_len--; m_freem(m); } KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__)); KASSERT(npwp->nw_len == 0, ("%s: len", __func__)); } /* * Remove the registration of a network protocol, which requires clearing * per-protocol fields across all workstreams, including freeing all mbufs in * the queues at time of unregister. All work in netisr is briefly suspended * while this takes place. */ void netisr_unregister(const struct netisr_handler *nhp) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); netisr_proto[proto].np_name = NULL; netisr_proto[proto].np_handler = NULL; netisr_proto[proto].np_m2flow = NULL; netisr_proto[proto].np_m2cpuid = NULL; netisr_proto[proto].np_qlimit = 0; netisr_proto[proto].np_policy = 0; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; netisr_drain_proto(npwp); bzero(npwp, sizeof(*npwp)); } NETISR_WUNLOCK(); } /* * Compose the global and per-protocol policies on dispatch, and return the * dispatch policy to use. */ static u_int netisr_get_dispatch(struct netisr_proto *npp) { /* * Protocol-specific configuration overrides the global default. */ if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT) return (npp->np_dispatch); return (netisr_dispatch_policy); } /* * Look up the workstream given a packet and source identifier. Do this by * checking the protocol's policy, and optionally call out to the protocol * for assistance if required. */ static struct mbuf * netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy, uintptr_t source, struct mbuf *m, u_int *cpuidp) { struct ifnet *ifp; u_int policy; NETISR_LOCK_ASSERT(); /* * In the event we have only one worker, shortcut and deliver to it * without further ado. */ if (nws_count == 1) { *cpuidp = nws_array[0]; return (m); } /* * What happens next depends on the policy selected by the protocol. * If we want to support per-interface policies, we should do that * here first. */ policy = npp->np_policy; if (policy == NETISR_POLICY_CPU) { m = npp->np_m2cpuid(m, source, cpuidp); if (m == NULL) return (NULL); /* * It's possible for a protocol not to have a good idea about * where to process a packet, in which case we fall back on * the netisr code to decide. In the hybrid case, return the * current CPU ID, which will force an immediate direct * dispatch. In the queued case, fall back on the SOURCE * policy. */ if (*cpuidp != NETISR_CPUID_NONE) return (m); if (dispatch_policy == NETISR_DISPATCH_HYBRID) { *cpuidp = curcpu; return (m); } policy = NETISR_POLICY_SOURCE; } if (policy == NETISR_POLICY_FLOW) { - if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) { + if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE && + npp->np_m2flow != NULL) { m = npp->np_m2flow(m, source); if (m == NULL) return (NULL); } - if (m->m_flags & M_FLOWID) { + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { *cpuidp = netisr_default_flow2cpu(m->m_pkthdr.flowid); return (m); } policy = NETISR_POLICY_SOURCE; } KASSERT(policy == NETISR_POLICY_SOURCE, ("%s: invalid policy %u for %s", __func__, npp->np_policy, npp->np_name)); ifp = m->m_pkthdr.rcvif; if (ifp != NULL) *cpuidp = nws_array[(ifp->if_index + source) % nws_count]; else *cpuidp = nws_array[source % nws_count]; return (m); } /* * Process packets associated with a workstream and protocol. For reasons of * fairness, we process up to one complete netisr queue at a time, moving the * queue to a stack-local queue for processing, but do not loop refreshing * from the global queue. The caller is responsible for deciding whether to * loop, and for setting the NWS_RUNNING flag. The passed workstream will be * locked on entry and relocked before return, but will be released while * processing. The number of packets processed is returned. */ static u_int netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto) { struct netisr_work local_npw, *npwp; u_int handled; struct mbuf *m; NETISR_LOCK_ASSERT(); NWS_LOCK_ASSERT(nwsp); KASSERT(nwsp->nws_flags & NWS_RUNNING, ("%s(%u): not running", __func__, proto)); KASSERT(proto >= 0 && proto < NETISR_MAXPROT, ("%s(%u): invalid proto\n", __func__, proto)); npwp = &nwsp->nws_work[proto]; if (npwp->nw_len == 0) return (0); /* * Move the global work queue to a thread-local work queue. * * Notice that this means the effective maximum length of the queue * is actually twice that of the maximum queue length specified in * the protocol registration call. */ handled = npwp->nw_len; local_npw = *npwp; npwp->nw_head = NULL; npwp->nw_tail = NULL; npwp->nw_len = 0; nwsp->nws_pendingbits &= ~(1 << proto); NWS_UNLOCK(nwsp); while ((m = local_npw.nw_head) != NULL) { local_npw.nw_head = m->m_nextpkt; m->m_nextpkt = NULL; if (local_npw.nw_head == NULL) local_npw.nw_tail = NULL; local_npw.nw_len--; VNET_ASSERT(m->m_pkthdr.rcvif != NULL, ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m)); CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); netisr_proto[proto].np_handler(m); CURVNET_RESTORE(); } KASSERT(local_npw.nw_len == 0, ("%s(%u): len %u", __func__, proto, local_npw.nw_len)); if (netisr_proto[proto].np_drainedcpu) netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu); NWS_LOCK(nwsp); npwp->nw_handled += handled; return (handled); } /* * SWI handler for netisr -- processes packets in a set of workstreams that * it owns, woken up by calls to NWS_SIGNAL(). If this workstream is already * being direct dispatched, go back to sleep and wait for the dispatching * thread to wake us up again. */ static void swi_net(void *arg) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; u_int bits, prot; nwsp = arg; #ifdef DEVICE_POLLING KASSERT(nws_count == 1, ("%s: device_polling but nws_count != 1", __func__)); netisr_poll(); #endif #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif NWS_LOCK(nwsp); KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running")); if (nwsp->nws_flags & NWS_DISPATCHING) goto out; nwsp->nws_flags |= NWS_RUNNING; nwsp->nws_flags &= ~NWS_SCHEDULED; while ((bits = nwsp->nws_pendingbits) != 0) { while ((prot = ffs(bits)) != 0) { prot--; bits &= ~(1 << prot); (void)netisr_process_workstream_proto(nwsp, prot); } } nwsp->nws_flags &= ~NWS_RUNNING; out: NWS_UNLOCK(nwsp); #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif #ifdef DEVICE_POLLING netisr_pollmore(); #endif } static int netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto, struct netisr_work *npwp, struct mbuf *m, int *dosignalp) { NWS_LOCK_ASSERT(nwsp); *dosignalp = 0; if (npwp->nw_len < npwp->nw_qlimit) { m->m_nextpkt = NULL; if (npwp->nw_head == NULL) { npwp->nw_head = m; npwp->nw_tail = m; } else { npwp->nw_tail->m_nextpkt = m; npwp->nw_tail = m; } npwp->nw_len++; if (npwp->nw_len > npwp->nw_watermark) npwp->nw_watermark = npwp->nw_len; /* * We must set the bit regardless of NWS_RUNNING, so that * swi_net() keeps calling netisr_process_workstream_proto(). */ nwsp->nws_pendingbits |= (1 << proto); if (!(nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) { nwsp->nws_flags |= NWS_SCHEDULED; *dosignalp = 1; /* Defer until unlocked. */ } npwp->nw_queued++; return (0); } else { m_freem(m); npwp->nw_qdrops++; return (ENOBUFS); } } static int netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid) { struct netisr_workstream *nwsp; struct netisr_work *npwp; int dosignal, error; #ifdef NETISR_LOCKING NETISR_LOCK_ASSERT(); #endif KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__, cpuid, mp_maxid)); KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); dosignal = 0; error = 0; nwsp = DPCPU_ID_PTR(cpuid, nws); npwp = &nwsp->nws_work[proto]; NWS_LOCK(nwsp); error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); return (error); } int netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif u_int cpuid; int error; KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED, source, m, &cpuid); if (m != NULL) { KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); error = netisr_queue_internal(proto, m, cpuid); } else error = ENOBUFS; #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif return (error); } int netisr_queue(u_int proto, struct mbuf *m) { return (netisr_queue_src(proto, 0, m)); } /* * Dispatch a packet for netisr processing; direct dispatch is permitted by * calling context. */ int netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; struct netisr_proto *npp; struct netisr_work *npwp; int dosignal, error; u_int cpuid, dispatch_policy; KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif npp = &netisr_proto[proto]; KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); dispatch_policy = netisr_get_dispatch(npp); if (dispatch_policy == NETISR_DISPATCH_DEFERRED) return (netisr_queue_src(proto, source, m)); /* * If direct dispatch is forced, then unconditionally dispatch * without a formal CPU selection. Borrow the current CPU's stats, * even if there's no worker on it. In this case we don't update * nws_flags because all netisr processing will be source ordered due * to always being forced to directly dispatch. */ if (dispatch_policy == NETISR_DISPATCH_DIRECT) { nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; npwp->nw_dispatched++; npwp->nw_handled++; netisr_proto[proto].np_handler(m); error = 0; goto out_unlock; } KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID, ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy)); /* * Otherwise, we execute in a hybrid mode where we will try to direct * dispatch if we're on the right CPU and the netisr worker isn't * already running. */ sched_pin(); m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID, source, m, &cpuid); if (m == NULL) { error = ENOBUFS; goto out_unpin; } KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); if (cpuid != curcpu) goto queue_fallback; nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; /*- * We are willing to direct dispatch only if three conditions hold: * * (1) The netisr worker isn't already running, * (2) Another thread isn't already directly dispatching, and * (3) The netisr hasn't already been woken up. */ NWS_LOCK(nwsp); if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) { error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); goto out_unpin; } /* * The current thread is now effectively the netisr worker, so set * the dispatching flag to prevent concurrent processing of the * stream from another thread (even the netisr worker), which could * otherwise lead to effective misordering of the stream. */ nwsp->nws_flags |= NWS_DISPATCHING; NWS_UNLOCK(nwsp); netisr_proto[proto].np_handler(m); NWS_LOCK(nwsp); nwsp->nws_flags &= ~NWS_DISPATCHING; npwp->nw_handled++; npwp->nw_hybrid_dispatched++; /* * If other work was enqueued by another thread while we were direct * dispatching, we need to signal the netisr worker to do that work. * In the future, we might want to do some of that work in the * current thread, rather than trigger further context switches. If * so, we'll want to establish a reasonable bound on the work done in * the "borrowed" context. */ if (nwsp->nws_pendingbits != 0) { nwsp->nws_flags |= NWS_SCHEDULED; dosignal = 1; } else dosignal = 0; NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); error = 0; goto out_unpin; queue_fallback: error = netisr_queue_internal(proto, m, cpuid); out_unpin: sched_unpin(); out_unlock: #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif return (error); } int netisr_dispatch(u_int proto, struct mbuf *m) { return (netisr_dispatch_src(proto, 0, m)); } #ifdef DEVICE_POLLING /* * Kernel polling borrows a netisr thread to run interface polling in; this * function allows kernel polling to request that the netisr thread be * scheduled even if no packets are pending for protocols. */ void netisr_sched_poll(void) { struct netisr_workstream *nwsp; nwsp = DPCPU_ID_PTR(nws_array[0], nws); NWS_SIGNAL(nwsp); } #endif static void netisr_start_swi(u_int cpuid, struct pcpu *pc) { char swiname[12]; struct netisr_workstream *nwsp; int error; KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); nwsp = DPCPU_ID_PTR(cpuid, nws); mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); nwsp->nws_cpu = cpuid; snprintf(swiname, sizeof(swiname), "netisr %u", cpuid); error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp, SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); if (error) panic("%s: swi_add %d", __func__, error); pc->pc_netisr = nwsp->nws_intr_event; if (netisr_bindthreads) { error = intr_event_bind(nwsp->nws_intr_event, cpuid); if (error != 0) printf("%s: cpu %u: intr_event_bind: %d", __func__, cpuid, error); } NETISR_WLOCK(); nws_array[nws_count] = nwsp->nws_cpu; nws_count++; NETISR_WUNLOCK(); } /* * Initialize the netisr subsystem. We rely on BSS and static initialization * of most fields in global data structures. * * Start a worker thread for the boot CPU so that we can support network * traffic immediately in case the network stack is used before additional * CPUs are started (for example, diskless boot). */ static void netisr_init(void *arg) { KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__)); NETISR_LOCK_INIT(); if (netisr_maxthreads < 1) netisr_maxthreads = 1; if (netisr_maxthreads > mp_ncpus) { printf("netisr_init: forcing maxthreads from %d to %d\n", netisr_maxthreads, mp_ncpus); netisr_maxthreads = mp_ncpus; } if (netisr_defaultqlimit > netisr_maxqlimit) { printf("netisr_init: forcing defaultqlimit from %d to %d\n", netisr_defaultqlimit, netisr_maxqlimit); netisr_defaultqlimit = netisr_maxqlimit; } #ifdef DEVICE_POLLING /* * The device polling code is not yet aware of how to deal with * multiple netisr threads, so for the time being compiling in device * polling disables parallel netisr workers. */ if (netisr_maxthreads != 1 || netisr_bindthreads != 0) { printf("netisr_init: forcing maxthreads to 1 and " "bindthreads to 0 for device polling\n"); netisr_maxthreads = 1; netisr_bindthreads = 0; } #endif netisr_start_swi(curcpu, pcpu_find(curcpu)); } SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); /* * Start worker threads for additional CPUs. No attempt to gracefully handle * work reassignment, we don't yet support dynamic reconfiguration. */ static void netisr_start(void *arg) { struct pcpu *pc; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (nws_count >= netisr_maxthreads) break; /* XXXRW: Is skipping absent CPUs still required here? */ if (CPU_ABSENT(pc->pc_cpuid)) continue; /* Worker will already be present for boot CPU. */ if (pc->pc_netisr != NULL) continue; netisr_start_swi(pc->pc_cpuid, pc); } } SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); /* * Sysctl monitoring for netisr: query a list of registered protocols. */ static int sysctl_netisr_proto(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_proto *snpp, *snp_array; struct netisr_proto *npp; u_int counter, proto; int error; if (req->newptr != NULL) return (EINVAL); snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); for (proto = 0; proto < NETISR_MAXPROT; proto++) { npp = &netisr_proto[proto]; if (npp->np_name == NULL) continue; snpp = &snp_array[counter]; snpp->snp_version = sizeof(*snpp); strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN); snpp->snp_proto = proto; snpp->snp_qlimit = npp->np_qlimit; snpp->snp_policy = npp->np_policy; snpp->snp_dispatch = npp->np_dispatch; if (npp->np_m2flow != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW; if (npp->np_m2cpuid != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID; if (npp->np_drainedcpu != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU; counter++; } NETISR_RUNLOCK(&tracker); KASSERT(counter <= NETISR_MAXPROT, ("sysctl_netisr_proto: counter too big (%d)", counter)); error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter); free(snp_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, proto, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto, "S,sysctl_netisr_proto", "Return list of protocols registered with netisr"); /* * Sysctl monitoring for netisr: query a list of workstreams. */ static int sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_workstream *snwsp, *snws_array; struct netisr_workstream *nwsp; u_int counter, cpuid; int error; if (req->newptr != NULL) return (EINVAL); snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; NWS_LOCK(nwsp); snwsp = &snws_array[counter]; snwsp->snws_version = sizeof(*snwsp); /* * For now, we equate workstream IDs and CPU IDs in the * kernel, but expose them independently to userspace in case * that assumption changes in the future. */ snwsp->snws_wsid = cpuid; snwsp->snws_cpu = cpuid; if (nwsp->nws_intr_event != NULL) snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR; NWS_UNLOCK(nwsp); counter++; } NETISR_RUNLOCK(&tracker); KASSERT(counter <= MAXCPU, ("sysctl_netisr_workstream: counter too big (%d)", counter)); error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter); free(snws_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, workstream, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream, "S,sysctl_netisr_workstream", "Return list of workstreams implemented by netisr"); /* * Sysctl monitoring for netisr: query per-protocol data across all * workstreams. */ static int sysctl_netisr_work(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_work *snwp, *snw_array; struct netisr_workstream *nwsp; struct netisr_proto *npp; struct netisr_work *nwp; u_int counter, cpuid, proto; int error; if (req->newptr != NULL) return (EINVAL); snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; NWS_LOCK(nwsp); for (proto = 0; proto < NETISR_MAXPROT; proto++) { npp = &netisr_proto[proto]; if (npp->np_name == NULL) continue; nwp = &nwsp->nws_work[proto]; snwp = &snw_array[counter]; snwp->snw_version = sizeof(*snwp); snwp->snw_wsid = cpuid; /* See comment above. */ snwp->snw_proto = proto; snwp->snw_len = nwp->nw_len; snwp->snw_watermark = nwp->nw_watermark; snwp->snw_dispatched = nwp->nw_dispatched; snwp->snw_hybrid_dispatched = nwp->nw_hybrid_dispatched; snwp->snw_qdrops = nwp->nw_qdrops; snwp->snw_queued = nwp->nw_queued; snwp->snw_handled = nwp->nw_handled; counter++; } NWS_UNLOCK(nwsp); } KASSERT(counter <= MAXCPU * NETISR_MAXPROT, ("sysctl_netisr_work: counter too big (%d)", counter)); NETISR_RUNLOCK(&tracker); error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter); free(snw_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, work, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work, "S,sysctl_netisr_work", "Return list of per-workstream, per-protocol work in netisr"); #ifdef DDB DB_SHOW_COMMAND(netisr, db_show_netisr) { struct netisr_workstream *nwsp; struct netisr_work *nwp; int first, proto; u_int cpuid; db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto", "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue"); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; first = 1; for (proto = 0; proto < NETISR_MAXPROT; proto++) { if (netisr_proto[proto].np_handler == NULL) continue; nwp = &nwsp->nws_work[proto]; if (first) { db_printf("%3d ", cpuid); first = 0; } else db_printf("%3s ", ""); db_printf( "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n", netisr_proto[proto].np_name, nwp->nw_len, nwp->nw_watermark, nwp->nw_qlimit, nwp->nw_dispatched, nwp->nw_hybrid_dispatched, nwp->nw_qdrops, nwp->nw_queued); } } } #endif Index: head/sys/netinet/in_pcb.h =================================================================== --- head/sys/netinet/in_pcb.h (revision 275357) +++ head/sys/netinet/in_pcb.h (revision 275358) @@ -1,679 +1,679 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ #include #include #include #include #ifdef _KERNEL #include #include #include #include #endif #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ struct inpcbpolicy; /* * struct inpcb is the common protocol control block structure used in most * IP transport protocols. * * Pointers to local and foreign host table entries, local and foreign socket * numbers, and pointers up (to a socket structure) and down (to a * protocol-specific control block) are stored here. */ LIST_HEAD(inpcbhead, inpcb); LIST_HEAD(inpcbporthead, inpcbport); typedef u_quad_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing * the following structure. */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; struct in_addr ia46_addr4; }; /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. */ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ union { /* foreign host table entry */ struct in_addr_4in6 ie46_foreign; struct in6_addr ie6_foreign; } ie_dependfaddr; union { /* local host table entry */ struct in_addr_4in6 ie46_local; struct in6_addr ie6_local; } ie_dependladdr; u_int32_t ie6_zoneid; /* scope zone id */ }; #define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 #define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 #define ie6_faddr ie_dependfaddr.ie6_foreign #define ie6_laddr ie_dependladdr.ie6_local /* * XXX The defines for inc_* are hacks and should be changed to direct * references. */ struct in_conninfo { u_int8_t inc_flags; u_int8_t inc_len; u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */ /* protocol dependent part */ struct in_endpoints inc_ie; }; /* * Flags for inc_flags. */ #define INC_ISIPV6 0x01 #define inc_isipv6 inc_flags /* temp compatability */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr #define inc6_zoneid inc_ie.ie6_zoneid struct icmp6_filter; /*- * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 * and IPv6 sockets. In the case of TCP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb * are static after creation or protected by a per-inpcb rwlock, inp_lock. A * few fields also require the global pcbinfo lock for the inpcb to be held, * when modified, such as the global connection lists and hashes, as well as * binding information (which affects which hash a connection is on). This * model means that connections can be looked up without holding the * per-connection lock, which is important for performance when attempting to * find the connection for a packet given its IP and port tuple. Writing to * these fields that write locks be held on both the inpcb and global locks. * * Key: * (c) - Constant after initialization * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking * * A few other notes: * * When a read lock is held, stability of the field is guaranteed; to write * to a field, a write lock must generally be held. * * netinet/netinet6-layer code should not assume that the inp_socket pointer * is safe to dereference without inp_lock being held, even for protocols * other than TCP (where the inpcb persists during TIMEWAIT even after the * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). */ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2*/ u_char inp_vflag; /* (i) IP version flag (v4/v6) */ u_char inp_ip_ttl; /* (i) time to live proto */ u_char inp_ip_p; /* (c) protocol proto */ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ u_int inp_refcount; /* (i) refcount */ void *inp_pspare[5]; /* (x) route caching / general use */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ u_int inp_ispare[4]; /* (x) route caching / user cookie / * general use */ /* Local and foreign ports, local and foreign addr. */ struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */ /* MAC and IPSEC policy information. */ struct label *inp_label; /* (i) MAC label */ struct inpcbpolicy *inp_sp; /* (s) for IPSEC */ /* Protocol-dependent part; options. */ struct { u_char inp4_ip_tos; /* (i) type of service proto */ struct mbuf *inp4_options; /* (i) IP options */ struct ip_moptions *inp4_moptions; /* (i) IP mcast options */ } inp_depend4; struct { /* (i) IP options */ struct mbuf *inp6_options; /* (i) IP6 options for outgoing packets */ struct ip6_pktopts *inp6_outputopts; /* (i) IP multicast options */ struct ip6_moptions *inp6_moptions; /* (i) ICMPv6 code type filter */ struct icmp6_filter *inp6_icmp6filt; /* (i) IPV6_CHECKSUM setsockopt */ int inp6_cksum; short inp6_hops; } inp_depend6; LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */ struct inpcbport *inp_phd; /* (i/p) head of this list */ #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* (c) generation count */ struct llentry *inp_lle; /* cached L2 information */ struct rtentry *inp_rt; /* cached L3 information */ struct rwlock inp_lock; }; #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport #define inp_faddr inp_inc.inc_faddr #define inp_laddr inp_inc.inc_laddr #define inp_ip_tos inp_depend4.inp4_ip_tos #define inp_options inp_depend4.inp4_options #define inp_moptions inp_depend4.inp4_moptions #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr #define in6p_zoneid inp_inc.inc6_zoneid #define in6p_hops inp_depend6.inp6_hops /* default hop limit */ #define in6p_flowinfo inp_flow #define in6p_options inp_depend6.inp6_options #define in6p_outputopts inp_depend6.inp6_outputopts #define in6p_moptions inp_depend6.inp6_moptions #define in6p_icmp6filt inp_depend6.inp6_icmp6filt #define in6p_cksum inp_depend6.inp6_cksum #define inp_vnet inp_pcbinfo->ipi_vnet /* * The range of the generation count, as used in this implementation, is 9e19. * We would have to create 300 billion connections per second for this number * to roll over in a year. This seems sufficiently unlikely that we simply * don't concern ourselves with that possibility. */ /* * Interface exported to userland by various protocols which use inpcbs. Hack * alert -- only define if struct xsocket is in scope. */ #ifdef _SYS_SOCKETVAR_H_ struct xinpcb { size_t xi_len; /* length of this structure */ struct inpcb xi_inp; struct xsocket xi_socket; u_quad_t xi_alignment_hack; }; struct xinpgen { size_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count at this time */ }; #endif /* _SYS_SOCKETVAR_H_ */ struct inpcbport { LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; u_short phd_port; }; /*- * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. * * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock, * the former covering mutable global fields (such as the global pcb list), * and the latter covering the hashed lookup tables. The lock order is: * * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks} * * Locking key: * * (c) Constant or nearly constant after initialisation * (g) Locked by ipi_lock * (h) Read using either ipi_hash_lock or inpcb lock; write requires both * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* * Global lock protecting global inpcb list, inpcb count, etc. */ struct rwlock ipi_lock; /* * Global list of inpcbs on the protocol. */ struct inpcbhead *ipi_listhead; /* (g) */ u_int ipi_count; /* (g) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ u_quad_t ipi_gencnt; /* (g) */ /* * Fields associated with port lookup and allocation. */ u_short ipi_lastport; /* (x) */ u_short ipi_lastlow; /* (x) */ u_short ipi_lasthi; /* (x) */ /* * UMA zone from which inpcbs are allocated for this protocol. */ struct uma_zone *ipi_zone; /* (c) */ /* * Connection groups associated with this protocol. These fields are * constant, but pcbgroup structures themselves are protected by * per-pcbgroup locks. */ struct inpcbgroup *ipi_pcbgroups; /* (c) */ u_int ipi_npcbgroups; /* (c) */ u_int ipi_hashfields; /* (c) */ /* * Global lock protecting non-pcbgroup hash lookup tables. */ struct rwlock ipi_hash_lock; /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. */ struct inpcbhead *ipi_hashbase; /* (h) */ u_long ipi_hashmask; /* (h) */ /* * Global hash of inpcbs, hashed by only local port number. */ struct inpcbporthead *ipi_porthashbase; /* (h) */ u_long ipi_porthashmask; /* (h) */ /* * List of wildcard inpcbs for use with pcbgroups. In the past, was * per-pcbgroup but is now global. All pcbgroup locks must be held * to modify the list, so any is sufficient to read it. */ struct inpcbhead *ipi_wildbase; /* (p) */ u_long ipi_wildmask; /* (p) */ /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ /* * general use 2 */ void *ipi_pspare[2]; }; #ifdef _KERNEL /* * Connection groups hold sets of connections that have similar CPU/thread * affinity. Each connection belongs to exactly one connection group. */ struct inpcbgroup { /* * Per-connection group hash of inpcbs, hashed by local and foreign * addresses and port numbers. */ struct inpcbhead *ipg_hashbase; /* (c) */ u_long ipg_hashmask; /* (c) */ /* * Notional affinity of this pcbgroup. */ u_int ipg_cpu; /* (p) */ /* * Per-connection group lock, not to be confused with ipi_lock. * Protects the hash table hung off the group, but also the global * wildcard list in inpcbinfo. */ struct mtx ipg_lock; } __aligned(CACHE_LINE_SIZE); #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) #define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock) #define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock) #define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) #define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) #define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock) #define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock) #define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock) #define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED) #define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED) #define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED) #define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED) /* * These locking functions are for inpcb consumers outside of sys/netinet, * more specifically, they were added for the benefit of TOE drivers. The * macros are reserved for use by the stack. */ void inp_wlock(struct inpcb *); void inp_wunlock(struct inpcb *); void inp_rlock(struct inpcb *); void inp_runlock(struct inpcb *); #ifdef INVARIANTS void inp_lock_assert(struct inpcb *); void inp_unlock_assert(struct inpcb *); #else static __inline void inp_lock_assert(struct inpcb *inp __unused) { } static __inline void inp_unlock_assert(struct inpcb *inp __unused) { } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg); int inp_ip_tos_get(const struct inpcb *inp); void inp_ip_tos_set(struct inpcb *inp, int val); struct socket * inp_inpcbtosocket(struct inpcb *inp); struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); short inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ #define INP_INFO_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE) #define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) #define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) #define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock) #define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) #define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED) #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) #define INP_HASH_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) #define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) #define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock) #define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock) #define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock) #define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock) #define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ RA_LOCKED) #define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ RA_WLOCKED) #define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ MTX_DEF | MTX_DUPOK) #define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) #define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) #define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) #define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) #define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3]) /* * Flags for inp_vflags -- historically version flags only */ #define INP_IPV4 0x1 #define INP_IPV6 0x2 #define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */ /* * Flags for inp_flags. */ #define INP_RECVOPTS 0x00000001 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x00000002 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x00000004 /* receive IP dst address */ #define INP_HDRINCL 0x00000008 /* user supplies entire IP header */ #define INP_HIGHPORT 0x00000010 /* user wants "high" port binding */ #define INP_LOWPORT 0x00000020 /* user wants "low" port binding */ #define INP_ANONPORT 0x00000040 /* port chosen for user */ #define INP_RECVIF 0x00000080 /* receive incoming interface */ #define INP_MTUDISC 0x00000100 /* user can do MTU discovery */ /* 0x000200 unused: was INP_FAITH */ #define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x00000800 /* don't fragment packet */ #define INP_BINDANY 0x00001000 /* allow bind to any address */ #define INP_INHASHLIST 0x00002000 /* in_pcbinshash() has been called */ #define INP_RECVTOS 0x00004000 /* receive incoming IP TOS */ #define IN6P_IPV6_V6ONLY 0x00008000 /* restrict AF_INET6 socket for v6 */ #define IN6P_PKTINFO 0x00010000 /* receive IP6 dst and I/F */ #define IN6P_HOPLIMIT 0x00020000 /* receive hoplimit */ #define IN6P_HOPOPTS 0x00040000 /* receive hop-by-hop options */ #define IN6P_DSTOPTS 0x00080000 /* receive dst options after rthdr */ #define IN6P_RTHDR 0x00100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x00400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */ #define INP_TIMEWAIT 0x01000000 /* in TIMEWAIT, ppcb is tcptw */ #define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */ #define INP_DROPPED 0x04000000 /* protocol drop flag */ #define INP_SOCKREF 0x08000000 /* strong socket reference */ -#define INP_SW_FLOWID 0x10000000 /* software generated flow id */ -#define INP_HW_FLOWID 0x20000000 /* hardware generated flow id */ +#define INP_RESERVED_0 0x10000000 /* reserved field */ +#define INP_RESERVED_1 0x20000000 /* reserved field */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\ IN6P_MTU) /* * Flags for inp_flags2. */ #define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ #define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ #define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ #define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ #define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ /* * Flags passed to in_pcblookup*() functions. */ #define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ #define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ #define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */ #define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ INPLOOKUP_WLOCKPCB) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) /* * Constants for pcbinfo.ipi_hashfields. */ #define IPI_HASHFIELDS_NONE 0 #define IPI_HASHFIELDS_2TUPLE 1 #define IPI_HASHFIELDS_4TUPLE 2 #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); VNET_DECLARE(int, ipport_lowfirstauto); VNET_DECLARE(int, ipport_lowlastauto); VNET_DECLARE(int, ipport_firstauto); VNET_DECLARE(int, ipport_lastauto); VNET_DECLARE(int, ipport_hifirstauto); VNET_DECLARE(int, ipport_hilastauto); VNET_DECLARE(int, ipport_randomized); VNET_DECLARE(int, ipport_randomcps); VNET_DECLARE(int, ipport_randomtime); VNET_DECLARE(int, ipport_stoprandom); VNET_DECLARE(int, ipport_tcpallocs); #define V_ipport_reservedhigh VNET(ipport_reservedhigh) #define V_ipport_reservedlow VNET(ipport_reservedlow) #define V_ipport_lowfirstauto VNET(ipport_lowfirstauto) #define V_ipport_lowlastauto VNET(ipport_lowlastauto) #define V_ipport_firstauto VNET(ipport_firstauto) #define V_ipport_lastauto VNET(ipport_lastauto) #define V_ipport_hifirstauto VNET(ipport_hifirstauto) #define V_ipport_hilastauto VNET(ipport_hilastauto) #define V_ipport_randomized VNET(ipport_randomized) #define V_ipport_randomcps VNET(ipport_randomcps) #define V_ipport_randomtime VNET(ipport_randomtime) #define V_ipport_stoprandom VNET(ipport_stoprandom) #define V_ipport_tcpallocs VNET(ipport_tcpallocs) void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, int, int, char *, uma_init, uma_fini, uint32_t, u_int); int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi); struct inpcbgroup * in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); struct inpcbgroup * in_pcbgroup_byinpcb(struct inpcb *); struct inpcbgroup * in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, struct in_addr, u_short); void in_pcbgroup_destroy(struct inpcbinfo *); int in_pcbgroup_enabled(struct inpcbinfo *); void in_pcbgroup_init(struct inpcbinfo *, u_int, int); void in_pcbgroup_remove(struct inpcb *); void in_pcbgroup_update(struct inpcb *); void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, struct ucred *, int); int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *, struct mbuf *); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); void in_pcbdetach(struct inpcb *); void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); int in_pcbinshash_nopcbgroup(struct inpcb *); int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, struct ucred *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); struct inpcb * in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *); int in_pcbrele(struct inpcb *); int in_pcbrele_rlocked(struct inpcb *); int in_pcbrele_wlocked(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ Index: head/sys/netinet/in_rss.c =================================================================== --- head/sys/netinet/in_rss.c (revision 275357) +++ head/sys/netinet/in_rss.c (revision 275358) @@ -1,900 +1,899 @@ /*- * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract * to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_pcbgroup.h" #ifndef PCBGROUP #error "options RSS depends on options PCBGROUP" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for software rss hash support */ #include #include #include /*- * Operating system parts of receiver-side scaling (RSS), which allows * network cards to direct flows to particular receive queues based on hashes * of header tuples. This implementation aligns RSS buckets with connection * groups at the TCP/IP layer, so each bucket is associated with exactly one * group. As a result, the group lookup structures (and lock) should have an * effective affinity with exactly one CPU. * * Network device drivers needing to configure RSS will query this framework * for parameters, such as the current RSS key, hashing policies, number of * bits, and indirection table mapping hashes to buckets and CPUs. They may * provide their own supplementary information, such as queue<->CPU bindings. * It is the responsibility of the network device driver to inject packets * into the stack on as close to the right CPU as possible, if playing by RSS * rules. * * TODO: * * - Synchronization for rss_key and other future-configurable parameters. * - Event handler drivers can register to pick up RSS configuration changes. * - Should we allow rss_basecpu to be configured? * - Randomize key on boot. * - IPv6 support. * - Statistics on how often there's a misalignment between hardware * placement and pcbgroup expectations. */ SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering"); /* * Toeplitz is the only required hash function in the RSS spec, so use it by * default. */ static u_int rss_hashalgo = RSS_HASH_TOEPLITZ; SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0, "RSS hash algorithm"); /* * Size of the indirection table; at most 128 entries per the RSS spec. We * size it to at least 2 times the number of CPUs by default to allow useful * rebalancing. If not set explicitly with a loader tunable, we tune based * on the number of CPUs present. * * XXXRW: buckets might be better to use for the tunable than bits. */ static u_int rss_bits; SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0, "RSS bits"); static u_int rss_mask; SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0, "RSS mask"); static const u_int rss_maxbits = RSS_MAXBITS; SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD, __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits"); /* * RSS's own count of the number of CPUs it could be using for processing. * Bounded to 64 by RSS constants. */ static u_int rss_ncpus; SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0, "Number of CPUs available to RSS"); #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1)) static const u_int rss_maxcpus = RSS_MAXCPUS; SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD, __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used"); /* * Variable exists just for reporting rss_bits in a user-friendly way. */ static u_int rss_buckets; SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0, "RSS buckets"); /* * Base CPU number; devices will add this to all CPU numbers returned by the * RSS indirection table. Currently unmodifable in FreeBSD. */ static const u_int rss_basecpu; SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD, __DECONST(int *, &rss_basecpu), 0, "RSS base CPU"); /* * RSS secret key, intended to prevent attacks on load-balancing. Its * effectiveness may be limited by algorithm choice and available entropy * during the boot. * * XXXRW: And that we don't randomize it yet! * * This is the default Microsoft RSS specification key which is also * the Chelsio T5 firmware default key. */ static uint8_t rss_key[RSS_KEYSIZE] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, }; /* * RSS hash->CPU table, which maps hashed packet headers to particular CPUs. * Drivers may supplement this table with a seperate CPU<->queue table when * programming devices. */ struct rss_table_entry { uint8_t rte_cpu; /* CPU affinity of bucket. */ }; static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; static inline u_int rss_gethashconfig_local(void); static void rss_init(__unused void *arg) { u_int i; u_int cpuid; /* * Validate tunables, coerce to sensible values. */ switch (rss_hashalgo) { case RSS_HASH_TOEPLITZ: case RSS_HASH_NAIVE: break; default: printf("%s: invalid RSS hashalgo %u, coercing to %u", __func__, rss_hashalgo, RSS_HASH_TOEPLITZ); rss_hashalgo = RSS_HASH_TOEPLITZ; } /* * Count available CPUs. * * XXXRW: Note incorrect assumptions regarding contiguity of this set * elsewhere. */ rss_ncpus = 0; for (i = 0; i <= mp_maxid; i++) { if (CPU_ABSENT(i)) continue; rss_ncpus++; } if (rss_ncpus > RSS_MAXCPUS) rss_ncpus = RSS_MAXCPUS; /* * Tune RSS table entries to be no less than 2x the number of CPUs * -- unless we're running uniprocessor, in which case there's not * much point in having buckets to rearrange for load-balancing! */ if (rss_ncpus > 1) { if (rss_bits == 0) rss_bits = fls(rss_ncpus - 1) + 1; /* * Microsoft limits RSS table entries to 128, so apply that * limit to both auto-detected CPU counts and user-configured * ones. */ if (rss_bits == 0 || rss_bits > RSS_MAXBITS) { printf("%s: RSS bits %u not valid, coercing to %u", __func__, rss_bits, RSS_MAXBITS); rss_bits = RSS_MAXBITS; } /* * Figure out how many buckets to use; warn if less than the * number of configured CPUs, although this is not a fatal * problem. */ rss_buckets = (1 << rss_bits); if (rss_buckets < rss_ncpus) printf("%s: WARNING: rss_buckets (%u) less than " "rss_ncpus (%u)\n", __func__, rss_buckets, rss_ncpus); rss_mask = rss_buckets - 1; } else { rss_bits = 0; rss_buckets = 1; rss_mask = 0; } /* * Set up initial CPU assignments: round-robin by default. */ cpuid = CPU_FIRST(); for (i = 0; i < rss_buckets; i++) { rss_table[i].rte_cpu = cpuid; cpuid = CPU_NEXT(cpuid); } /* * Randomize rrs_key. * * XXXRW: Not yet. If nothing else, will require an rss_isbadkey() * loop to check for "bad" RSS keys. */ } SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL); static uint32_t rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen, const uint8_t *data) { uint32_t v; u_int i; v = 0; for (i = 0; i < keylen; i++) v += key[i]; for (i = 0; i < datalen; i++) v += data[i]; return (v); } static uint32_t rss_hash(u_int datalen, const uint8_t *data) { switch (rss_hashalgo) { case RSS_HASH_TOEPLITZ: return (toeplitz_hash(sizeof(rss_key), rss_key, datalen, data)); case RSS_HASH_NAIVE: return (rss_naive_hash(sizeof(rss_key), rss_key, datalen, data)); default: panic("%s: unsupported/unknown hashalgo %d", __func__, rss_hashalgo); } } /* * Hash an IPv4 2-tuple. */ uint32_t rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst) { uint8_t data[sizeof(src) + sizeof(dst)]; u_int datalen; datalen = 0; bcopy(&src, &data[datalen], sizeof(src)); datalen += sizeof(src); bcopy(&dst, &data[datalen], sizeof(dst)); datalen += sizeof(dst); return (rss_hash(datalen, data)); } /* * Hash an IPv4 4-tuple. */ uint32_t rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst, u_short dstport) { uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + sizeof(dstport)]; u_int datalen; datalen = 0; bcopy(&src, &data[datalen], sizeof(src)); datalen += sizeof(src); bcopy(&dst, &data[datalen], sizeof(dst)); datalen += sizeof(dst); bcopy(&srcport, &data[datalen], sizeof(srcport)); datalen += sizeof(srcport); bcopy(&dstport, &data[datalen], sizeof(dstport)); datalen += sizeof(dstport); return (rss_hash(datalen, data)); } #ifdef INET6 /* * Hash an IPv6 2-tuple. */ uint32_t rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst) { uint8_t data[sizeof(src) + sizeof(dst)]; u_int datalen; datalen = 0; bcopy(&src, &data[datalen], sizeof(src)); datalen += sizeof(src); bcopy(&dst, &data[datalen], sizeof(dst)); datalen += sizeof(dst); return (rss_hash(datalen, data)); } /* * Hash an IPv6 4-tuple. */ uint32_t rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport, struct in6_addr dst, u_short dstport) { uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + sizeof(dstport)]; u_int datalen; datalen = 0; bcopy(&src, &data[datalen], sizeof(src)); datalen += sizeof(src); bcopy(&dst, &data[datalen], sizeof(dst)); datalen += sizeof(dst); bcopy(&srcport, &data[datalen], sizeof(srcport)); datalen += sizeof(srcport); bcopy(&dstport, &data[datalen], sizeof(dstport)); datalen += sizeof(dstport); return (rss_hash(datalen, data)); } #endif /* INET6 */ /* * Query the number of RSS bits in use. */ u_int rss_getbits(void) { return (rss_bits); } /* * Query the RSS bucket associated with an RSS hash. */ u_int rss_getbucket(u_int hash) { return (hash & rss_mask); } /* * Query the RSS layer bucket associated with the given * entry in the RSS hash space. * * The RSS indirection table is 0 .. rss_buckets-1, * covering the low 'rss_bits' of the total 128 slot * RSS indirection table. So just mask off rss_bits and * return that. * * NIC drivers can then iterate over the 128 slot RSS * indirection table and fetch which RSS bucket to * map it to. This will typically be a CPU queue */ u_int rss_get_indirection_to_bucket(u_int index) { return (index & rss_mask); } /* * Query the RSS CPU associated with an RSS bucket. */ u_int rss_getcpu(u_int bucket) { return (rss_table[bucket].rte_cpu); } /* * netisr CPU affinity lookup given just the hash and hashtype. */ u_int rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type) { switch (hash_type) { case M_HASHTYPE_RSS_IPV4: case M_HASHTYPE_RSS_TCP_IPV4: case M_HASHTYPE_RSS_UDP_IPV4: case M_HASHTYPE_RSS_IPV6: case M_HASHTYPE_RSS_TCP_IPV6: case M_HASHTYPE_RSS_UDP_IPV6: return (rss_getcpu(rss_getbucket(hash_val))); default: return (NETISR_CPUID_NONE); } } /* * Query the RSS bucket associated with the given hash value and * type. */ int rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id) { switch (hash_type) { case M_HASHTYPE_RSS_IPV4: case M_HASHTYPE_RSS_TCP_IPV4: case M_HASHTYPE_RSS_UDP_IPV4: case M_HASHTYPE_RSS_IPV6: case M_HASHTYPE_RSS_TCP_IPV6: case M_HASHTYPE_RSS_UDP_IPV6: *bucket_id = rss_getbucket(hash_val); return (0); default: return (-1); } } /* * netisr CPU affinity lookup routine for use by protocols. */ struct mbuf * rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) { M_ASSERTPKTHDR(m); *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); return (m); } int rss_m2bucket(struct mbuf *m, uint32_t *bucket_id) { M_ASSERTPKTHDR(m); return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), bucket_id)); } /* * Calculate an appropriate ipv4 2-tuple or 4-tuple given the given * IPv4 source/destination address, UDP or TCP source/destination ports * and the protocol type. * * The protocol code may wish to do a software hash of the given * tuple. This depends upon the currently configured RSS hash types. * * This assumes that the packet in question isn't a fragment. * * It also assumes the packet source/destination address * are in "incoming" packet order (ie, source is "far" address.) */ int rss_proto_software_hash_v4(struct in_addr s, struct in_addr d, u_short sp, u_short dp, int proto, uint32_t *hashval, uint32_t *hashtype) { uint32_t hash; /* * Next, choose the hash type depending upon the protocol * identifier. */ if ((proto == IPPROTO_TCP) && (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4)) { hash = rss_hash_ip4_4tuple(s, sp, d, dp); *hashval = hash; *hashtype = M_HASHTYPE_RSS_TCP_IPV4; return (0); } else if ((proto == IPPROTO_UDP) && (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4)) { hash = rss_hash_ip4_4tuple(s, sp, d, dp); *hashval = hash; *hashtype = M_HASHTYPE_RSS_UDP_IPV4; return (0); } else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) { /* RSS doesn't hash on other protocols like SCTP; so 2-tuple */ hash = rss_hash_ip4_2tuple(s, d); *hashval = hash; *hashtype = M_HASHTYPE_RSS_IPV4; return (0); } /* No configured available hashtypes! */ printf("%s: no available hashtypes!\n", __func__); return (-1); } /* * Do a software calculation of the RSS for the given mbuf. * * This is typically used by the input path to recalculate the RSS after * some form of packet processing (eg de-capsulation, IP fragment reassembly.) * * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and * RSS_HASH_PKT_EGRESS for outgoing. * * Returns 0 if a hash was done, -1 if no hash was done, +1 if * the mbuf already had a valid RSS flowid. * * This function doesn't modify the mbuf. It's up to the caller to * assign flowid/flowtype as appropriate. */ int rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval, uint32_t *hashtype) { const struct ip *ip; const struct tcphdr *th; const struct udphdr *uh; + uint32_t flowid; + uint32_t flowtype; uint8_t proto; int iphlen; int is_frag = 0; /* * XXX For now this only handles hashing on incoming mbufs. */ if (dir != RSS_HASH_PKT_INGRESS) { printf("%s: called on EGRESS packet!\n", __func__); return (-1); } /* * First, validate that the mbuf we have is long enough * to have an IPv4 header in it. */ if (m->m_pkthdr.len < (sizeof(struct ip))) { printf("%s: short mbuf pkthdr\n", __func__); return (-1); } if (m->m_len < (sizeof(struct ip))) { printf("%s: short mbuf len\n", __func__); return (-1); } /* Ok, let's dereference that */ ip = mtod(m, struct ip *); proto = ip->ip_p; iphlen = ip->ip_hl << 2; /* * If this is a fragment then it shouldn't be four-tuple * hashed just yet. Once it's reassembled into a full * frame it should be re-hashed. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) is_frag = 1; /* * If the mbuf flowid/flowtype matches the packet type, * and we don't support the 4-tuple version of the given protocol, * then signal to the owner that it can trust the flowid/flowtype * details. * * This is a little picky - eg, if TCPv4 / UDPv4 hashing * is supported but we got a TCP/UDP frame only 2-tuple hashed, * then we shouldn't just "trust" the 2-tuple hash. We need * a 4-tuple hash. */ - if (m->m_flags & M_FLOWID) { - uint32_t flowid, flowtype; + flowid = m->m_pkthdr.flowid; + flowtype = M_HASHTYPE_GET(m); - flowid = m->m_pkthdr.flowid; - flowtype = M_HASHTYPE_GET(m); - + if (flowtype != M_HASHTYPE_NONE) { switch (proto) { case IPPROTO_UDP: if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) && (flowtype == M_HASHTYPE_RSS_UDP_IPV4) && (is_frag == 0)) { return (1); } /* * Only allow 2-tuple for UDP frames if we don't also * support 4-tuple for UDP. */ if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) == 0) && flowtype == M_HASHTYPE_RSS_IPV4) { return (1); } break; case IPPROTO_TCP: if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) && (flowtype == M_HASHTYPE_RSS_TCP_IPV4) && (is_frag == 0)) { return (1); } /* * Only allow 2-tuple for TCP frames if we don't also * support 2-tuple for TCP. */ if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) == 0) && flowtype == M_HASHTYPE_RSS_IPV4) { return (1); } break; default: if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && flowtype == M_HASHTYPE_RSS_IPV4) { return (1); } break; } } /* * Decode enough information to make a hash decision. * * XXX TODO: does the hardware hash on 4-tuple if IP * options are present? */ if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) && (proto == IPPROTO_TCP) && (is_frag == 0)) { if (m->m_len < iphlen + sizeof(struct tcphdr)) { printf("%s: short TCP frame?\n", __func__); return (-1); } th = (struct tcphdr *)((caddr_t)ip + iphlen); return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, th->th_sport, th->th_dport, proto, hashval, hashtype); } else if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) && (proto == IPPROTO_UDP) && (is_frag == 0)) { uh = (struct udphdr *)((caddr_t)ip + iphlen); if (m->m_len < iphlen + sizeof(struct udphdr)) { printf("%s: short UDP frame?\n", __func__); return (-1); } return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, uh->uh_sport, uh->uh_dport, proto, hashval, hashtype); } else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) { /* Default to 2-tuple hash */ return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, 0, /* source port */ 0, /* destination port */ 0, /* IPPROTO_IP */ hashval, hashtype); } else { printf("%s: no available hashtypes!\n", __func__); return (-1); } } /* * Similar to rss_m2cpuid, but designed to be used by the IP NETISR * on incoming frames. * * If an existing RSS hash exists and it matches what the configured * hashing is, then use it. * * If there's an existing RSS hash but the desired hash is different, * or if there's no useful RSS hash, then calculate it via * the software path. * * XXX TODO: definitely want statistics here! */ struct mbuf * rss_soft_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) { uint32_t hash_val, hash_type; int ret; M_ASSERTPKTHDR(m); ret = rss_mbuf_software_hash_v4(m, RSS_HASH_PKT_INGRESS, &hash_val, &hash_type); if (ret > 0) { /* mbuf has a valid hash already; don't need to modify it */ *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); } else if (ret == 0) { /* hash was done; update */ m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); - m->m_flags |= M_FLOWID; *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); } else { /* ret < 0 */ /* no hash was done */ *cpuid = NETISR_CPUID_NONE; } return (m); } /* * Query the RSS hash algorithm. */ u_int rss_gethashalgo(void) { return (rss_hashalgo); } /* * Query the current RSS key; likely to be used by device drivers when * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE. * * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing? */ void rss_getkey(uint8_t *key) { bcopy(rss_key, key, sizeof(rss_key)); } /* * Query the number of buckets; this may be used by both network device * drivers, which will need to populate hardware shadows of the software * indirection table, and the network stack itself (such as when deciding how * many connection groups to allocate). */ u_int rss_getnumbuckets(void) { return (rss_buckets); } /* * Query the number of CPUs in use by RSS; may be useful to device drivers * trying to figure out how to map a larger number of CPUs into a smaller * number of receive queues. */ u_int rss_getnumcpus(void) { return (rss_ncpus); } static inline u_int rss_gethashconfig_local(void) { /* Return 4-tuple for TCP; 2-tuple for others */ /* * UDP may fragment more often than TCP and thus we'll end up with * NICs returning 2-tuple fragments. * udp_init() and udplite_init() both currently initialise things * as 2-tuple. * So for now disable UDP 4-tuple hashing until all of the other * pieces are in place. */ return ( RSS_HASHTYPE_RSS_IPV4 | RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_IPV6_EX | RSS_HASHTYPE_RSS_TCP_IPV6_EX #if 0 | RSS_HASHTYPE_RSS_UDP_IPV4 | RSS_HASHTYPE_RSS_UDP_IPV4_EX | RSS_HASHTYPE_RSS_UDP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV6_EX #endif ); } /* * Return the supported RSS hash configuration. * * NICs should query this to determine what to configure in their redirection * matching table. */ u_int rss_gethashconfig(void) { return (rss_gethashconfig_local()); } /* * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want * it appearing in debugging output unnecessarily. */ static int sysctl_rss_key(SYSCTL_HANDLER_ARGS) { uint8_t temp_rss_key[RSS_KEYSIZE]; int error; error = priv_check(req->td, PRIV_NETINET_HASHKEY); if (error) return (error); bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key)); error = sysctl_handle_opaque(oidp, temp_rss_key, sizeof(temp_rss_key), req); if (error) return (error); if (req->newptr != NULL) { /* XXXRW: Not yet. */ return (EINVAL); } return (0); } SYSCTL_PROC(_net_inet_rss, OID_AUTO, key, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key, "", "RSS keying material"); static int sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; int i; error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < rss_buckets; i++) { sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ", i, rss_getcpu(i)); } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping"); Index: head/sys/netinet/ip_input.c =================================================================== --- head/sys/netinet/ip_input.c (revision 275357) +++ head/sys/netinet/ip_input.c (revision 275358) @@ -1,1872 +1,1871 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /* IPSEC */ #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif struct rwlock in_ifaddr_lock; RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); VNET_DEFINE(int, ip_do_randomid); SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_do_randomid), 0, "Assign random ip_id values"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ static VNET_DEFINE(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_m2cpuid, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ static VNET_DEFINE(uma_zone_t, ipq_zone); static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]); static struct mtx ipqlock; #define V_ipq_zone VNET(ipq_zone) #define V_ipq VNET(ipq) #define IPQ_LOCK() mtx_lock(&ipqlock) #define IPQ_UNLOCK() mtx_unlock(&ipqlock) #define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF) #define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED) static void maxnipq_update(void); static void ipq_zone_change(void *); static void ip_drain_locked(void); static VNET_DEFINE(int, maxnipq); /* Administrative limit on # reass queues. */ static VNET_DEFINE(int, nipq); /* Total # of reass queues */ #define V_maxnipq VNET(maxnipq) #define V_nipq VNET(nipq) SYSCTL_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(nipq), 0, "Current number of IPv4 fragment reassembly queue entries"); static VNET_DEFINE(int, maxfragsperpacket); #define V_maxfragsperpacket VNET(maxfragsperpacket) SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif static void ip_freef(struct ipqhead *, struct ipq *); /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct protosw *pr; int i; V_ip_id = time_second & 0xffff; TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ for (i = 0; i < IPREASS_NHASH; i++) TAILQ_INIT(&V_ipq[i]); V_maxnipq = nmbclusters / 32; V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxnipq_update(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, NULL, EVENTHANDLER_PRI_ANY); /* Initialize various other remaining things. */ IPQ_LOCK_INIT(); netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE void ip_destroy(void) { int i; if ((i = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, i); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); IPQ_LOCK(); ip_drain_locked(); IPQ_UNLOCK(); uma_zdestroy(V_ipq_zone); } #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* 127/8 must not appear on wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip_ipsec_filtertunnel(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL); if (dchg != 0) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ /* IN_IFADDR_RLOCK(); */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } } /* IN_IFADDR_RUNLOCK(); */ /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #endif } IF_ADDR_RUNLOCK(ifp); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { #ifdef IPSEC if (ip_ipsec_fwd(m)) goto bad; #endif /* IPSEC */ ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip_ipsec_input(m)) goto bad; #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * After maxnipq has been updated, propagate the change to UMA. The UMA zone * max has slightly different semantics than the sysctl, for historical * reasons. */ static void maxnipq_update(void) { /* * -1 for unlimited allocation. */ if (V_maxnipq < 0) uma_zone_set_max(V_ipq_zone, 0); /* * Positive number for specific bound. */ if (V_maxnipq > 0) uma_zone_set_max(V_ipq_zone, V_maxnipq); /* * Zero specifies no further fragment queue allocation -- set the * bound very low, but rely on implementation elsewhere to actually * prevent allocation and reclaim current queues. */ if (V_maxnipq == 0) uma_zone_set_max(V_ipq_zone, 1); } static void ipq_zone_change(void *tag) { if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { V_maxnipq = nmbclusters / 32; maxnipq_update(); } } static int sysctl_maxnipq(SYSCTL_HANDLER_ARGS) { int error, i; i = V_maxnipq; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); /* * XXXRW: Might be a good idea to sanity check the argument and place * an extreme upper bound. */ if (i < -1) return (EINVAL); V_maxnipq = i; maxnipq_update(); return (0); } SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, sysctl_maxnipq, "I", "Maximum number of IPv4 fragment reassembly queue entries"); #define M_IP_FRAG M_PROTO9 /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. If the argument is the first fragment or one * in between the function will return NULL and store the mbuf * in the fragment chain. If the argument is the last fragment * the packet will be reassembled and the pointer to the new * mbuf returned for further processing. Only m_tags attached * to the first packet/fragment are preserved. * The IP header is *NOT* adjusted out of iplen. */ struct mbuf * ip_reass(struct mbuf *m) { struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp = NULL; struct ipqhead *head; int i, hlen, next; u_int8_t ecn, ecn0; u_short hash; #ifdef RSS uint32_t rss_hash, rss_type; #endif /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); return (NULL); } ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); head = &V_ipq[hash]; IPQ_LOCK(); /* * Look for queue of fragments * of this datagram. */ TAILQ_FOREACH(fp, head, ipq_list) if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && #ifdef MAC mac_ipq_match(m, fp) && #endif ip->ip_p == fp->ipq_p) goto found; fp = NULL; /* * Attempt to trim the number of allocated fragment queues if it * exceeds the administrative limit. */ if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { /* * drop something from the tail of the current queue * before proceeding further */ struct ipq *q = TAILQ_LAST(head, ipqhead); if (q == NULL) { /* gak */ for (i = 0; i < IPREASS_NHASH; i++) { struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); if (r) { IPSTAT_ADD(ips_fragtimeout, r->ipq_nfrags); ip_freef(&V_ipq[i], r); break; } } } else { IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags); ip_freef(head, q); } } found: /* * Adjust ip_len to not reflect header, * convert offset of this to bytes. */ ip->ip_len = htons(ntohs(ip->ip_len) - hlen); if (ip->ip_off & htons(IP_MF)) { /* * Make sure that fragments have a data length * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) { IPSTAT_INC(ips_toosmall); /* XXX */ goto dropfrag; } m->m_flags |= M_IP_FRAG; } else m->m_flags &= ~M_IP_FRAG; ip->ip_off = htons(ntohs(ip->ip_off) << 3); /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ IPSTAT_INC(ips_fragments); m->m_pkthdr.PH_loc.ptr = ip; /* Previous ip_reass() started here. */ /* * Presence of header sizes in mbufs * would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); fp = NULL; goto dropfrag; } mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); V_nipq++; fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; m->m_nextpkt = NULL; goto done; } else { fp->ipq_nfrags++; #ifdef MAC mac_ipq_update(m, fp); #endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.PH_loc.ptr)) /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * drop if CE and not-ECT are mixed for the same packet. */ ecn = ip->ip_tos & IPTOS_ECN_MASK; ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) goto dropfrag; if (ecn0 != IPTOS_ECN_CE) GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) goto dropfrag; /* * Find a segment which begins after this one does. */ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off)) break; /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us, otherwise * stick new segment in the proper place. * * If some of the data is dropped from the preceding * segment, then it's checksum is invalidated. */ if (p) { i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) - ntohs(ip->ip_off); if (i > 0) { if (i >= ntohs(ip->ip_len)) goto dropfrag; m_adj(m, i); m->m_pkthdr.csum_flags = 0; ip->ip_off = htons(ntohs(ip->ip_off) + i); ip->ip_len = htons(ntohs(ip->ip_len) - i); } m->m_nextpkt = p->m_nextpkt; p->m_nextpkt = m; } else { m->m_nextpkt = fp->ipq_frags; fp->ipq_frags = m; } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) > ntohs(GETIP(q)->ip_off); q = nq) { i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) - ntohs(GETIP(q)->ip_off); if (i < ntohs(GETIP(q)->ip_len)) { GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i); GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i); m_adj(q, i); q->m_pkthdr.csum_flags = 0; break; } nq = q->m_nextpkt; m->m_nextpkt = nq; IPSTAT_INC(ips_fragdropped); fp->ipq_nfrags--; m_freem(q); } /* * Check for complete reassembly and perform frag per packet * limiting. * * Frag limiting is performed here so that the nth frag has * a chance to complete the packet before we drop the packet. * As a result, n+1 frags are actually allowed per packet, but * only n will ever be stored. (n = maxfragsperpacket.) * */ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); ip_freef(head, fp); } goto done; } next += ntohs(GETIP(q)->ip_len); } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_IP_FRAG) { if (fp->ipq_nfrags > V_maxfragsperpacket) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); ip_freef(head, fp); } goto done; } /* * Reassembly is complete. Make sure the packet is a sane size. */ q = fp->ipq_frags; ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); ip_freef(head, fp); goto done; } /* * Concatenate fragments. */ m = q; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = q->m_nextpkt; q->m_nextpkt = NULL; for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; m_cat(m, q); } /* * In order to do checksumming faster we do 'end-around carry' here * (and not in for{} loop), though it implies we are not going to * reassemble more than 64k fragments. */ while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); #endif /* * Create header for new ip packet by modifying header of first * packet; dequeue and discard fragment reassembly header. * Make header visible. */ ip->ip_len = htons((ip->ip_hl << 2) + next); ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); V_nipq--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ m_fixhdr(m); IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(); #ifdef RSS /* * Query the RSS layer for the flowid / flowtype for the * mbuf payload. * * For now, just assume we have to calculate a new one. * Later on we should check to see if the assigned flowid matches * what RSS wants for the given IP protocol and if so, just keep it. * * We then queue into the relevant netisr so it can be dispatched * to the correct CPU. * * Note - this may return 1, which means the flowid in the mbuf * is correct for the configured RSS hash types and can be used. */ if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { m->m_pkthdr.flowid = rss_hash; M_HASHTYPE_SET(m, rss_type); - m->m_flags |= M_FLOWID; } /* * Queue/dispatch for reprocessing. * * Note: this is much slower than just handling the frame in the * current receive context. It's likely worth investigating * why this is. */ netisr_dispatch(NETISR_IP_DIRECT, m); return (NULL); #endif /* Handle in-line */ return (m); dropfrag: IPSTAT_INC(ips_fragdropped); if (fp != NULL) fp->ipq_nfrags--; m_freem(m); done: IPQ_UNLOCK(); return (NULL); #undef GETIP } /* * Free a fragment reassembly header and all * associated datagrams. */ static void ip_freef(struct ipqhead *fhp, struct ipq *fp) { struct mbuf *q; IPQ_LOCK_ASSERT(); while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } TAILQ_REMOVE(fhp, fp, ipq_list); uma_zfree(V_ipq_zone, fp); V_nipq--; } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); struct ipq *fp; int i; VNET_LIST_RLOCK_NOSLEEP(); IPQ_LOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { struct ipq *fpp; fpp = fp; fp = TAILQ_NEXT(fp, ipq_list); if(--fpp->ipq_ttl == 0) { IPSTAT_ADD(ips_fragtimeout, fpp->ipq_nfrags); ip_freef(&V_ipq[i], fpp); } } } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. */ if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { for (i = 0; i < IPREASS_NHASH; i++) { while (V_nipq > V_maxnipq && !TAILQ_EMPTY(&V_ipq[i])) { IPSTAT_ADD(ips_fragdropped, TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } } CURVNET_RESTORE(); } IPQ_UNLOCK(); VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Drain off all datagram fragments. */ static void ip_drain_locked(void) { int i; IPQ_LOCK_ASSERT(); for (i = 0; i < IPREASS_NHASH; i++) { while(!TAILQ_EMPTY(&V_ipq[i])) { IPSTAT_ADD(ips_fragdropped, TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); IPQ_LOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ip_drain_locked(); CURVNET_RESTORE(); } IPQ_UNLOCK(); VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } /* * Given address of next destination (final or next hop), return (referenced) * internet address info of interface to be used to get there. */ struct in_ifaddr * ip_rtaddr(struct in_addr dst, u_int fibnum) { struct route sro; struct sockaddr_in *sin; struct in_ifaddr *ia; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; in_rtalloc_ign(&sro, 0, fibnum); if (sro.ro_rt == NULL) return (NULL); ia = ifatoia(sro.ro_rt->rt_ifa); ifa_ref(&ia->ia_ifa); RTFREE(sro.ro_rt); return (ia); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } #ifdef IPSTEALTH } #endif ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); #ifndef IPSEC /* * 'ia' may be NULL if there is no route for this destination. * In case of IPsec, Don't discard it just yet, but pass it to * ip_output in case of outgoing IPsec policy. */ if (!srcrt && ia == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return; } #endif /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (!V_ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct sockaddr_in *sin; struct rtentry *rt; bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; in_rtalloc_ign(&ro, 0, M_GETFIB(m)); rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } if (rt) RTFREE(rt); } /* * Try to cache the route MTU from ip_output so we can consider it for * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191. */ bzero(&ro, sizeof(ro)); error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } } if (mcopy == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return; } switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifdef IPSEC /* * If IPsec is configured for this path, * override any possibly mtu value set by ip_output. */ mtu = ip_ipsec_mtu(mcopy, mtu); #endif /* IPSEC */ /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } if (ia != NULL) ifa_free(&ia->ia_ifa); icmp_error(mcopy, type, code, dest.s_addr, mtu); } void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; bintime2timeval(&bt, &tv); *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ static VNET_DEFINE(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 275357) +++ head/sys/netinet/ip_output.c (revision 275358) @@ -1,1375 +1,1373 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #ifdef IPSEC #include #include #endif /* IPSEC*/ #include #include VNET_DEFINE(u_short, ip_id); #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; struct sockaddr_in *dst; const struct sockaddr_in *gw; struct in_ifaddr *ia; int isbroadcast; uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ struct in_addr odst; struct m_tag *fwd_tag = NULL; uint32_t fibnum; int have_ia_ref; int needfiblookup; #ifdef IPSEC int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); - if (((flags & IP_NODEFAULTFLOWID) == 0) && - inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { + if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); - m->m_flags |= M_FLOWID; } } if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET, m, ro); #endif if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); /* * Fill in IP header. If we are not allowing fragmentation, * then the ip_id field is meaningless, but we don't set it * to zero. Doing so causes various problems when devices along * the path (routers, load balancers, firewalls, etc.) illegally * disable DF on our packet. Note that a 16-bit counter * will wrap around in less than 10 seconds at 100 Mbit/s on a * medium with MTU 1500. See Steven M. Bellovin, "A Technique * for Counting NATted Hosts", Proc. IMW'02, available at * . */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); IPSTAT_INC(ips_localout); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } /* * dst/gw handling: * * dst can be rewritten but always points to &ro->ro_dst. * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ gw = dst = (struct sockaddr_in *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: ia = NULL; have_ia_ref = 0; /* * If there is a cached route, check that it is to the same * destination and is still up. If not, free it and try again. * The address family should also be checked in case of sharing * the cache with IPv6. */ rte = ro->ro_rt; if (rte && ((rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RO_RTFREE(ro); ro->ro_lle = NULL; rte = NULL; gw = dst; } if (rte == NULL && fwd_tag == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); if (ia) have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (rte == NULL) { #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), fibnum); #else in_rtalloc_ign(ro, 0, fibnum); #endif rte = ro->ro_rt; } if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { #ifdef IPSEC /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ia = ifatoia(rte->rt_ifa); ifp = rte->rt_ifp; counter_u64_add(rte->rt_pksent, 1); if (rte->rt_flags & RTF_GATEWAY) gw = (struct sockaddr_in *)rte->rt_gateway; if (rte->rt_flags & RTF_HOST) isbroadcast = (rte->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(gw->sin_addr, ifp); } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) mtu = rte->rt_mtu; else mtu = ifp->if_mtu; /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, dst, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC switch(ip_ipsec_output(&m, inp, &flags, &error)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IP, m); goto done; } else { if (have_ia_ref) ifa_free(&ia->ia_ifa); needfiblookup = 1; /* Redo the routing table lookup. */ } } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_RTFREE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); if (have_ia_ref) ifa_free(&ia->ia_ifa); goto again; } passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: if (ro == &iproute) RO_RTFREE(ro); if (have_ia_ref) ifa_free(&ia->ia_ifa); return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; struct mbuf *m; for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) off += m->m_len; /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } m->m_flags |= (m0->m_flags & M_MCAST); /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_netinet_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; uint16_t csum, offset, ip_len; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; ip_len = ntohs(ip->ip_len); csum = in_cksum_skip(m, ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ /* find the mbuf in the chain where the checksum starts*/ while ((m != NULL) && (offset >= m->m_len)) { offset -= m->m_len; m = m->m_next; } KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain.")); KASSERT(offset + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs.")); *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(inp, sopt->sopt_name, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, int hlen) { register struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif if_simloop(ifp, copym, dst->sin_family, 0); } } Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c (revision 275357) +++ head/sys/netinet/tcp_input.c (revision 275358) @@ -1,3680 +1,3678 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #include const int tcprexmtthresh = 3; int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(int, tcp_delack_enabled) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; #define V_drop_synfin VNET(drop_synfin) SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0, "Experimental TCP extensions"); VNET_DEFINE(int, tcp_do_initcwnd10) = 1; SYSCTL_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_initcwnd10), 0, "Enable RFC 6928 (Increasing initial CWND to 10)"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 0; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving up on ECN"); VNET_DEFINE(int, tcp_insecure_syn) = 0; #define V_tcp_insecure_syn VNET(tcp_insecure_syn) SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_syn), 0, "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); VNET_DEFINE(int, tcp_insecure_rst) = 0; #define V_tcp_insecure_rst VNET(tcp_insecure_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_inc), 0, "Incrementor step size of automatic receive buffer"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type); static void inline cc_conn_init(struct tcpcb *tp); static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); static void inline hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); /* * TCP statistics are stored in an "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); VNET_PCPUSTAT_SYSINIT(tcpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(tcpstat); #endif /* VIMAGE */ /* * Kernel module interface for updating tcpstat. The argument is an index * into tcpstat treated as an array. */ void kmod_tcpstat_inc(int statnum) { counter_u64_add(VNET(tcpstat)[statnum], 1); } /* * Wrapper for the TCP established input helper hook. */ static void inline hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, tp->osd); } } /* * CC wrapper hook functions */ static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if (tp->snd_cwnd <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, V_tcp_abc_l_var * tp->t_maxseg); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } } static void inline cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; TCPSTAT_INC(tcps_usedrtt); if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } /* * Set the initial slow-start flight size. * * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. */ if (tp->snd_cwnd == 1) tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ else if (V_tcp_do_initcwnd10) tp->snd_cwnd = min(10 * tp->t_maxseg, max(2 * tp->t_maxseg, 14600)); else if (V_tcp_do_rfc3390) tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (tp->t_maxseg > 2190) tp->snd_cwnd = 2 * tp->t_maxseg; else if (tp->t_maxseg > 1095) tp->snd_cwnd = 3 * tp->t_maxseg; else tp->snd_cwnd = 4 * tp->t_maxseg; } if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags)) { TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg) * tp->t_maxseg; tp->snd_cwnd = tp->t_maxseg; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_flags &= ~TF_PREVVALID; tp->t_badrxtwin = 0; break; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } } static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); /* XXXLAS: KASSERT that we're in recovery? */ if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } /* XXXLAS: EXIT_RECOVERY ? */ tp->t_bytes_acked = 0; } #ifdef TCP_SIGNATURE static inline int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) { int ret; tcp_fields_to_net(th); ret = tcp_signature_verify(m, off0, tlen, optlen, to, th, tcpbflag); tcp_fields_to_host(th); return (ret); } #endif /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. * - Delayed acks are enabled or this is a half-synchronized T/TCP * connection. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxopd) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct in6_ifaddr *ia6; struct ip6_hdr *ip6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ip6 = mtod(m, struct ip6_hdr *); ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ifa_free(&ia6->ia_ifa); ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return (IPPROTO_DONE); } if (ia6) ifa_free(&ia6->ia_ifa); return (tcp_input(mp, offp, proto)); } #endif /* INET6 */ int tcp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int off0; int optlen = 0; #ifdef INET int len; #endif int tlen = 0, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ #ifdef TCP_SIGNATURE uint8_t sig_checked = 0; #endif uint8_t iptos = 0; struct m_tag *fwd_tag = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ int ti_locked; #define TI_UNLOCKED 1 #define TI_WLOCKED 2 #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif off0 = *offp; m = *mp; *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); #ifdef INET6 if (isipv6) { /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ntohs(ip->ip_len) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ len = off0 + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(tlen + off0); } if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } /* Re-initialization for later version check */ ip->ip_v = IPVERSION; } #endif /* INET */ #ifdef INET6 if (isipv6) iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET iptos = ip->ip_tos; #endif /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); } } #endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Locate pcb for segment; if we're likely to add or remove a * connection then first acquire pcbinfo lock. There are three cases * where we might discover later we need a write lock despite the * flags: ACKs moving a connection out of the syncache, ACKs for a * connection in TIMEWAIT and SYNs not targeting a listening socket. */ if ((thflags & (TH_FIN | TH_RST)) != 0) { INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; } else ti_locked = TI_UNLOCKED; /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ( #ifdef INET6 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) #ifdef INET || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) #endif #endif #if defined(INET) && !defined(INET6) (m->m_flags & M_IP_NEXTHOP) #endif ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); findpcb: #ifdef INVARIANTS if (ti_locked == TI_WLOCKED) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } else { INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); #endif /* INET */ /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_WLOCK_ASSERT(inp); - if (!(inp->inp_flags & INP_HW_FLOWID) - && (m->m_flags & M_FLOWID) - && ((inp->inp_socket == NULL) - || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) { - inp->inp_flags |= INP_HW_FLOWID; - inp->inp_flags &= ~INP_SW_FLOWID; + if ((inp->inp_flowtype == M_HASHTYPE_NONE) && + (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && + ((inp->inp_socket == NULL) || + (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { IPSEC6STAT_INC(ips_in_polvio); goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { IPSECSTAT_INC(ips_in_polvio); goto dropunlock; } #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch stray * or duplicate segments arriving late. If this segment was a * legitimate new connection attempt, the old INPCB gets removed and * we can try again to find a listening socket. * * At this point, due to earlier optimism, we may hold only an inpcb * lock, and not the inpcbinfo write lock. If so, we need to try to * acquire it, or if that fails, acquire a reference on the inpcb, * drop all locks, acquire a global write lock, and then re-acquire * the inpcb lock. We may at that point discover that another thread * has tried to free the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. * * XXXRW: It may be time to rethink timewait locking. */ relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } } else ti_locked = TI_WLOCKED; } INP_INFO_WLOCK_ASSERT(&V_tcbinfo); if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; INP_INFO_WUNLOCK(&V_tcbinfo); return (IPPROTO_DONE); } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); m = NULL; /* consumed by the TOE driver */ goto dropunlock; } #endif /* * We've identified a valid inpcb, but it could be that we need an * inpcbinfo write lock but don't hold it. In this case, attempt to * acquire using the same strategy as the TIMEWAIT case above. If we * relock, we have to jump back to 'relocked' as the connection might * now be in TIMEWAIT. */ #ifdef INVARIANTS if ((thflags & (TH_FIN | TH_RST)) != 0) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #endif if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) || (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN)))) { if (ti_locked == TI_UNLOCKED) { if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } goto relocked; } else ti_locked = TI_WLOCKED; } INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; #ifdef INET6 if (isipv6) { bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); } else #endif bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* TCPDEBUG */ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; inc.inc_fibnum = so->so_fibnum; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ if (!syncache_expand(&inc, &to, th, &so, m)) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. */ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); INP_WLOCK(inp); /* new connection */ tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); #ifdef TCP_SIGNATURE if (sig_checked == 0) { tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to, th, tp->t_flags)) { /* * In SYN_SENT state if it receives an * RST, it is allowed for further * processing. */ if ((thflags & TH_RST) == 0 || (tp->t_state == TCPS_SYN_SENT) == 0) goto dropunlock; } sig_checked = 1; } #endif /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { ifa_free(&ia6->ia_ifa); if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (ia6) ifa_free(&ia6->ia_ifa); } #endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } #ifdef INET6 if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } #endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); /* * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN * flag is removed first while connections are drained * from the accept queue in a unlock/lock cycle of the * ACCEPT_LOCK, opening a race condition allowing a SYN * attempt go through unhandled. */ goto dropunlock; } #ifdef TCP_SIGNATURE if (sig_checked == 0) { tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to, th, tp->t_flags)) { /* * In SYN_SENT state if it receives an RST, it is * allowed for further processing. */ if ((thflags & TH_RST) == 0 || (tp->t_state == TCPS_SYN_SENT) == 0) goto dropunlock; } sig_checked = 1; } #endif TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif if (inp != NULL) INP_WUNLOCK(inp); drop: INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return (IPPROTO_DONE); } static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) { int thflags, acked, ourfinisacked, needoutput = 0; int rstreason, todrop, win; u_long tiwin; char *s; struct in_conninfo *inc; struct mbuf *mfree; struct tcpopt to; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either alocked or unlocked, as the * caller may have unnecessarily acquired a write lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS if (ti_locked == TI_WLOCKED) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. */ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { u_int t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* Some progress has been made. */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); if (sbavail(&so->so_snd)) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. */ if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); * 3. received bytes per RTT is within seven eighth of the * current socket buffer size; * 4. receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ if (V_tcp_do_autorcvbuf && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && to.to_tsecr - tp->rfbuf_ts < hz) { if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); } /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else tp->rfbuf_cnt += tlen; /* add up */ } /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); /* Reset receive buffer auto scaling when not in bulk receive mode. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += imin(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " "ti_locked %d", __func__, ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_WLOCKED, ("%s: TH_RST ti_locked %d, th %p tp %p", __func__, ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { KASSERT(ti_locked == TI_WLOCKED, ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, &to, th->th_ack); /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { cc_ack_received(tp, th, CC_DUPACK); u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. */ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { u_int t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. */ cc_ack_received(tp, th, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { tp->snd_wnd -= sbavail(&so->so_snd); mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp, tlen)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " "TCP_FIN_WAIT_2 ti_locked: %d", __func__, ti_locked)); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); return; } } if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); #endif /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. */ static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { INP_WLOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. */ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } #endif /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (th->th_flags & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (th->th_flags & TH_SYN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; #endif case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ static void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; INP_WLOCK_ASSERT(tp->t_inpcb); TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing interface * without forcing IP to fragment. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS * settings are handled in tcp_mssopt(). */ void tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) { int mss = 0; u_long maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; int origoffer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(tp->t_inpcb); if (mtuoffer != -1) { KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } origoffer = offer; /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; } #endif /* * No route to sender, stay with default mss and return. */ if (maxmtu == 0) { /* * In case we return early we need to initialize metrics * to a defined state as tcp_hc_get() would do for us * if there was no cache hit. */ if (metricptr != NULL) bzero(metricptr, sizeof(struct hc_metrics_lite)); return; } /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxopd above. */ offer = tp->t_maxopd; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, V_tcp_minmss); } /* * rmx information is now retrieved from tcp_hostcache. */ tcp_hc_get(&inp->inp_inc, &metrics); if (metricptr != NULL) bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* * If there's a discovered mtu in tcp hostcache, use it. * Else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } #endif /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. * The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ } mss = min(mss, offer); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ mss = max(mss, 64); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * origoffer==-1 indicates that no segments were received yet. * In this case we just guess. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; tp->t_maxseg = mss; } void tcp_mss(struct tcpcb *tp, int offer) { int mss; u_long bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; struct tcp_ifcap cap; KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); bzero(&cap, sizeof(cap)); tcp_mss_update(tp, offer, -1, &metrics, &cap); mss = tp->t_maxseg; inp = tp->t_inpcb; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* Check the interface for TSO capabilities. */ if (cap.ifcap & CSUM_TSO) { tp->t_flags |= TF_TSO; tp->t_tsomax = cap.tsomax; tp->t_tsomaxsegcount = cap.tsomaxsegcount; tp->t_tsomaxsegsize = cap.tsomaxsegsize; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); min_protoh = sizeof(struct tcpiphdr); } #endif #if defined(INET6) || defined(INET) thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ #endif if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; } Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c (revision 275357) +++ head/sys/netinet/tcp_syncache.c (revision 275358) @@ -1,2046 +1,2044 @@ /*- * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. [2001 McAfee, Inc.] * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include static VNET_DEFINE(int, tcp_syncookies) = 1; #define V_tcp_syncookies VNET(tcp_syncookies) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookies), 0, "Use TCP SYN cookies if the syncache overflows"); static VNET_DEFINE(int, tcp_syncookiesonly) = 0; #define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); #ifdef TCP_OFFLOAD #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); static int syncache_respond(struct syncache *, struct syncache_head *, int); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, uint8_t *, uintptr_t); static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso); #endif /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 static VNET_DEFINE(struct tcp_syncache, tcp_syncache); #define V_tcp_syncache VNET(tcp_syncache) static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.bucket_limit), 0, "Per-bucket hash limit for syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.cache_limit), 0, "Overall entry limit for syncache"); SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET, &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.hashsize), 0, "Size of TCP syncache hashtable"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncache.rexmt_limit), 0, "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SYNCACHE_HASH(inc, mask) \ ((V_tcp_syncache.hash_secret ^ \ (inc)->inc_faddr.s_addr ^ \ ((inc)->inc_faddr.s_addr >> 16) ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define SYNCACHE_HASH6(inc, mask) \ ((V_tcp_syncache.hash_secret ^ \ (inc)->inc6_faddr.s6_addr32[0] ^ \ (inc)->inc6_faddr.s6_addr32[3] ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define ENDPTS_EQ(a, b) ( \ (a)->ie_fport == (b)->ie_fport && \ (a)->ie_lport == (b)->ie_lport && \ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ ) #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { int i; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &V_tcp_syncache.bucket_limit); if (!powerof2(V_tcp_syncache.hashsize) || V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ V_tcp_syncache.cache_limit = V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &V_tcp_syncache.cache_limit); /* Allocate the hash table. */ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); #ifdef VIMAGE V_tcp_syncache.vnet = curvnet; #endif /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; } /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); /* Start the SYN cookie reseeder callout. */ callout_init(&V_tcp_syncache.secret.reseed, 1); arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, syncookie_reseed, &V_tcp_syncache); } #ifdef VIMAGE void syncache_destroy(void) { struct syncache_head *sch; struct syncache *sc, *nsc; int i; /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; callout_drain(&sch->sch_timer); SCH_LOCK(sch); TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) syncache_drop(sc, sch); SCH_UNLOCK(sch); KASSERT(TAILQ_EMPTY(&sch->sch_bucket), ("%s: sch->sch_bucket not empty", __func__)); KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", __func__, sch->sch_length)); mtx_destroy(&sch->sch_mtx); } KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, ("%s: cache_count not 0", __func__)); /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); callout_drain(&V_tcp_syncache.secret.reseed); } #endif /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); syncache_drop(sc2, sch); TCPSTAT_INC(tcps_sc_bucketoverflow); } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_added(tod, sc->sc_todctx); } #endif /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); TCPSTAT_INC(tcps_sc_added); } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { SCH_LOCK_ASSERT(sch); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif syncache_free(sc); } /* * Engage/reengage time on bucket row. */ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { sc->sc_rxttime = ticks + TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]); sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; struct syncache *sc, *nsc; int tick = ticks; char *s; CURVNET_SET(sch->sch_sc->vnet); /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happens often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK. */ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_stale); continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } syncache_respond(sc, sch, 1); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); CURVNET_RESTORE(); } /* * Find an entry in the syncache. * Returns always with locked syncache_head plus a matching entry or NULL. */ static struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { struct syncache *sc; struct syncache_head *sch; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { sch = &V_tcp_syncache.hashbase[ SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } else #endif { sch = &V_tcp_syncache.hashbase[ SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) continue; #endif if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } SCH_LOCK_ASSERT(*schp); return (NULL); /* always returns with locked sch */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. */ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; char *s = NULL; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote endpoint\n", s, __func__); TCPSTAT_INC(tcps_sc_reset); } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " "IRS %u (+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); TCPSTAT_INC(tcps_badrst); } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_badack); } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) goto done; /* * If we've rertransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_unreach); done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; int error; char *s; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, 0); if (so == NULL) { /* * Drop the connection; we will either send a RST or * have the peer retransmit its SYN again after its * RTO and try again. */ TCPSTAT_INC(tcps_listendrop); if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } goto abort2; } #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif /* * If there's an mbuf and it has a flowid, then let's initialise the * inp with that particular flowid. */ - if (m != NULL && m->m_flags & M_FLOWID) { - inp->inp_flags |= INP_HW_FLOWID; - inp->inp_flags &= ~INP_SW_FLOWID; + if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } /* * Install in the reservation hash table for now, but don't yet * install a connection group since the full 4-tuple isn't yet * configured. */ inp->inp_lport = sc->sc_inc.inc_lport; if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbinshash failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } #ifdef IPSEC /* Copy old policy into new socket's. */ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) printf("syncache_socket: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different than the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, thread0.td_ucred, m)) != 0) { inp->in6p_laddr = laddr6; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; inp->inp_flow |= sc->sc_flowlabel; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, thread0.td_ucred, m)) != 0) { inp->inp_laddr = laddr; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } } #endif /* INET */ INP_HASH_WUNLOCK(&V_tcbinfo); tp = intotcpcb(inp); tcp_state_change(tp, TCPS_SYN_RECEIVED); tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = tcp_ts_getticks(); tp->ts_offset = sc->sc_tsoff; } #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } if (sc->sc_flags & SCF_ECN) tp->t_flags |= TF_ECN_PERMIT; /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, indicate that CWND to be * limited to one segment in cc_conn_init(). * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. */ if (sc->sc_rxmits > 1) tp->snd_cwnd = 1; #ifdef TCP_OFFLOAD /* * Allow a TOE driver to install its hooks. Note that we hold the * pcbinfo lock too and that prevents tcp_usr_accept from accepting a * new connection before the TOE driver has done its thing. */ if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_offload_socket(tod, sc->sc_todctx, so); } #endif /* * Copy and activate timers. */ tp->t_keepinit = sototcpcb(lso)->t_keepinit; tp->t_keepidle = sototcpcb(lso)->t_keepidle; tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); INP_WUNLOCK(inp); soisconnected(so); TCPSTAT_INC(tcps_accepts); return (so); abort: INP_WUNLOCK(inp); abort2: if (so != NULL) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); #ifdef INVARIANTS /* * Test code for syncookies comparing the syncache stored * values with the reconstructed values from the cookie. */ if (sc != NULL) syncookie_cmp(inc, sch, sc, th, to, *lsop); #endif if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. See if this socket has had a syncache entry dropped in * the past. We don't want to accept a bogus syncookie * if we've never received a SYN. * B. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. */ if (!V_tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } } else { /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ if (SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } /* * If timestamps were not negotiated during SYN/ACK they * must not appear on any segment during this session. */ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session. * XXXAO: This is only informal as there have been unverified * reports of non-compliants stacks. */ if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); s = NULL; } } /* * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. */ if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", s, __func__, to->to_tsecr, sc->sc_ts); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) TCPSTAT_INC(tcps_sc_aborted); else TCPSTAT_INC(tcps_sc_completed); /* how do we find the inp for the new socket? */ if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL && sc != &scs) syncache_free(sc); if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. */ void syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, void *todctx) { struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int ltflags; int win, sb_hiwat, ip_ttl, ip_tos; char *s; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; struct ucred *cred; INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ so = *lsop; tp = sototcpcb(so); cred = crhold(so->so_cred); #ifdef INET6 if ((inc->inc_flags & INC_ISIPV6) && (inp->inp_flags & IN6P_AUTOFLOWLABEL)) autoflowlabel = 1; #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; win = sbspace(&so->so_rcv); sb_hiwat = so->so_rcv.sb_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); /* By the time we drop the lock these should no longer be used. */ so = NULL; tp = NULL; #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_WUNLOCK(inp); goto done; } else mac_syncache_create(maclabel, inp); #endif INP_WUNLOCK(inp); /* * Remember the IP options, if any. */ #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif #ifdef INET ipopts = (m) ? ip_srcroute(m) : NULL; #else ipopts = NULL; #endif /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); #endif /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (syncache_respond(sc, sch, 1) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } SCH_UNLOCK(sch); goto done; } sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ TCPSTAT_INC(tcps_sc_zonefail); if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) syncache_drop(sc, sch); sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (V_tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_cred = cred; cred = NULL; sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif { sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } #ifdef TCP_OFFLOAD sc->sc_tod = tod; sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. */ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (V_tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; sc->sc_ts = tcp_ts_getticks(); sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatiblity problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #ifdef TCP_SIGNATURE /* * If listening socket requested TCP digests, OR received SYN * contains the option, flag this in the syncache so that * syncache_respond() will do the right thing with the SYN+ACK. */ if (to->to_flags & TOF_SIGNATURE || ltflags & TF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (ltflags & TF_NOOPT) sc->sc_flags |= SCF_NOOPT; if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); #ifdef INET6 if (autoflowlabel) { if (V_tcp_syncookies) sc->sc_flowlabel = sc->sc_iss; else sc->sc_flowlabel = ip6_randomflowlabel(); sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } #endif SCH_UNLOCK(sch); /* * Do a standard 3-way handshake. */ if (syncache_respond(sc, sch, 0) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } else { if (sc != &scs) syncache_free(sc); TCPSTAT_INC(tcps_sc_dropped); } done: if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif if (m) { *lsop = NULL; m_freem(m); } } static int syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked) { struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th = NULL; int optlen, error = 0; /* Make compiler happy */ u_int16_t hlen, tlen, mssopt; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif #ifdef TCP_SIGNATURE struct secasvar *sav; #endif hlen = #ifdef INET6 (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_peer_mss) mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small")); /* Create the IP+TCP header from scratch. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; ip6->ip6_flow |= sc->sc_flowlabel; th = (struct tcphdr *)(ip6 + 1); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = htons(tlen); ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= htons(IP_DF); th = (struct tcphdr *)(ip + 1); } #endif /* INET */ th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; if (sc->sc_flags & SCF_ECN) { th->th_flags |= TH_ECE; TCPSTAT_INC(tcps_ecn_shs); } /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) { /* Virgin timestamp or TCP cookie enhanced one. */ to.to_tsval = sc->sc_ts; to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #ifdef TCP_SIGNATURE sav = NULL; if (sc->sc_flags & SCF_SIGNATURE) { sav = tcp_get_sav(m, IPSEC_DIR_OUTBOUND); if (sav != NULL) to.to_flags |= TOF_SIGNATURE; else { /* * We've got SCF_SIGNATURE flag * inherited from listening socket, * but no SADB key for given source * address. Assume signature is not * required and remove signature flag * instead of silently dropping * connection. */ if (locked == 0) SCH_LOCK(sch); sc->sc_flags &= ~SCF_SIGNATURE; if (locked == 0) SCH_UNLOCK(sch); } } #endif optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. */ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tcp_signature_do_compute(m, 0, optlen, to.to_signature, sav); #endif #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len = htons(ntohs(ip->ip_len) + optlen); } else optlen = 0; M_SETFIB(m, sc->sc_inc.inc_fibnum); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(NULL, NULL); #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif return (error); } /* * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks * that exceed the capacity of the syncache by avoiding the storage of any * of the SYNs we receive. Syncookies defend against blind SYN flooding * attacks where the attacker does not have access to our responses. * * Syncookies encode and include all necessary information about the * connection setup within the SYN|ACK that we send back. That way we * can avoid keeping any local state until the ACK to our SYN|ACK returns * (if ever). Normally the syncache and syncookies are running in parallel * with the latter taking over when the former is exhausted. When matching * syncache entry is found the syncookie is ignored. * * The only reliable information persisting the 3WHS is our inital sequence * number ISS of 32 bits. Syncookies embed a cryptographically sufficient * strong hash (MAC) value and a few bits of TCP SYN options in the ISS * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK * returns and signifies a legitimate connection if it matches the ACK. * * The available space of 32 bits to store the hash and to encode the SYN * option information is very tight and we should have at least 24 bits for * the MAC to keep the number of guesses by blind spoofing reasonably high. * * SYN option information we have to encode to fully restore a connection: * MSS: is imporant to chose an optimal segment size to avoid IP level * fragmentation along the path. The common MSS values can be encoded * in a 3-bit table. Uncommon values are captured by the next lower value * in the table leading to a slight increase in packetization overhead. * WSCALE: is necessary to allow large windows to be used for high delay- * bandwidth product links. Not scaling the window when it was initially * negotiated is bad for performance as lack of scaling further decreases * the apparent available send window. We only need to encode the WSCALE * we received from the remote end. Our end can be recalculated at any * time. The common WSCALE values can be encoded in a 3-bit table. * Uncommon values are captured by the next lower value in the table * making us under-estimate the available window size halving our * theoretically possible maximum throughput for that connection. * SACK: Greatly assists in packet loss recovery and requires 1 bit. * TIMESTAMP and SIGNATURE is not encoded because they are permanent options * that are included in all segments on a connection. We enable them when * the ACK has them. * * Security of syncookies and attack vectors: * * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) * together with the gloabl secret to make it unique per connection attempt. * Thus any change of any of those parameters results in a different MAC output * in an unpredictable way unless a collision is encountered. 24 bits of the * MAC are embedded into the ISS. * * To prevent replay attacks two rotating global secrets are updated with a * new random value every 15 seconds. The life-time of a syncookie is thus * 15-30 seconds. * * Vector 1: Attacking the secret. This requires finding a weakness in the * MAC itself or the way it is used here. The attacker can do a chosen plain * text attack by varying and testing the all parameters under his control. * The strength depends on the size and randomness of the secret, and the * cryptographic security of the MAC function. Due to the constant updating * of the secret the attacker has at most 29.999 seconds to find the secret * and launch spoofed connections. After that he has to start all over again. * * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC * size an average of 4,823 attempts are required for a 50% chance of success * to spoof a single syncookie (birthday collision paradox). However the * attacker is blind and doesn't know if one of his attempts succeeded unless * he has a side channel to interfere success from. A single connection setup * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. * This many attempts are required for each one blind spoofed connection. For * every additional spoofed connection he has to launch another N attempts. * Thus for a sustained rate 100 spoofed connections per second approximately * 1,800,000 packets per second would have to be sent. * * NB: The MAC function should be fast so that it doesn't become a CPU * exhaustion attack vector itself. * * References: * RFC4987 TCP SYN Flooding Attacks and Common Mitigations * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (details) * * * Schematic construction of a syncookie enabled Initial Sequence Number: * 0 1 2 3 * 12345678901234567890123456789012 * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| * * x 24 MAC (truncated) * W 3 Send Window Scale index * M 3 MSS index * S 1 SACK permitted * P 1 Odd/even secret */ /* * Distribution and probability of certain MSS values. Those in between are * rounded down to the next lower one. * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] * .2% .3% 5% 7% 7% 20% 15% 45% */ static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; /* * Distribution and probability of certain WSCALE values. We have to map the * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 * bits based on prevalence of certain values. Where we don't have an exact * match for are rounded down to the next lower one letting us under-estimate * the true available window. At the moment this would happen only for the * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer * and window size). The absence of the WSCALE option (no scaling in either * direction) is encoded with index zero. * [WSCALE values histograms, Allman, 2012] * X 10 10 35 5 6 14 10% by host * X 11 4 5 5 18 49 3% by connections */ static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; /* * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed * and good cryptographic properties. */ static uint32_t syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, uint8_t *secbits, uintptr_t secmod) { SIPHASH_CTX ctx; uint32_t siphash[2]; SipHash24_Init(&ctx); SipHash_SetKey(&ctx, secbits); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); break; #endif } SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); SipHash_Update(&ctx, &flags, sizeof(flags)); SipHash_Update(&ctx, &secmod, sizeof(secmod)); SipHash_Final((u_int8_t *)&siphash, &ctx); return (siphash[0] ^ siphash[1]); } static tcp_seq syncookie_generate(struct syncache_head *sch, struct syncache *sc) { u_int i, mss, secbit, wscale; uint32_t iss, hash; uint8_t *secbits; union syncookie cookie; SCH_LOCK_ASSERT(sch); cookie.cookie = 0; /* Map our computed MSS into the 3-bit index. */ mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss)); for (i = sizeof(tcp_sc_msstab) / sizeof(*tcp_sc_msstab) - 1; tcp_sc_msstab[i] > mss && i > 0; i--) ; cookie.flags.mss_idx = i; /* * Map the send window scale into the 3-bit index but only if * the wscale option was received. */ if (sc->sc_flags & SCF_WINSCALE) { wscale = sc->sc_requested_s_scale; for (i = sizeof(tcp_sc_wstab) / sizeof(*tcp_sc_wstab) - 1; tcp_sc_wstab[i] > wscale && i > 0; i--) ; cookie.flags.wscale_idx = i; } /* Can we do SACK? */ if (sc->sc_flags & SCF_SACK) cookie.flags.sack_ok = 1; /* Which of the two secrets to use. */ secbit = sch->sch_sc->secret.oddeven & 0x1; cookie.flags.odd_even = secbit; secbits = sch->sch_sc->secret.key[secbit]; hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, (uintptr_t)sch); /* * Put the flags into the hash and XOR them to get better ISS number * variance. This doesn't enhance the cryptographic strength and is * done to prevent the 8 cookie bits from showing up directly on the * wire. */ iss = hash & ~0xff; iss |= cookie.cookie ^ (hash >> 24); /* Randomize the timestamp. */ if (sc->sc_flags & SCF_TIMESTAMP) { sc->sc_ts = arc4random(); sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); } TCPSTAT_INC(tcps_sc_sendcookie); return (iss); } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { uint32_t hash; uint8_t *secbits; tcp_seq ack, seq; int wnd, wscale = 0; union syncookie cookie; SCH_LOCK_ASSERT(sch); /* * Pull information out of SYN-ACK/ACK and revert sequence number * advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; /* * Unpack the flags containing enough information to restore the * connection. */ cookie.cookie = (ack & 0xff) ^ (ack >> 24); /* Which of the two secrets to use. */ secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); /* The recomputed hash matches the ACK if this was a genuine cookie. */ if ((ack & ~0xff) != (hash & ~0xff)) return (NULL); /* Fill in the syncache values. */ sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; break; #endif #ifdef INET6 case INC_ISIPV6: if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; break; #endif } sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; /* We can simply recompute receive window scale we sent earlier. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; /* Only use wscale if it was enabled in the orignal SYN. */ if (cookie.flags.wscale_idx > 0) { sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; sc->sc_flags |= SCF_WINSCALE; } wnd = sbspace(&lso->so_rcv); wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; if (cookie.flags.sack_ok) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); } if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; sc->sc_rxmits = 0; TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { struct syncache scs, *scx; char *s; bzero(&scs, sizeof(scs)); scx = syncookie_lookup(inc, sch, &scs, th, to, lso); if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) return (0); if (scx != NULL) { if (sc->sc_peer_mss != scx->sc_peer_mss) log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", s, __func__, sc->sc_requested_r_scale, scx->sc_requested_r_scale); if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", s, __func__, sc->sc_requested_s_scale, scx->sc_requested_s_scale); if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); } if (s != NULL) free(s, M_TCPLOG); return (0); } #endif /* INVARIANTS */ static void syncookie_reseed(void *arg) { struct tcp_syncache *sc = arg; uint8_t *secbits; int secbit; /* * Reseeding the secret doesn't have to be protected by a lock. * It only must be ensured that the new random values are visible * to all CPUs in a SMP environment. The atomic with release * semantics ensures that. */ secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; secbits = sc->secret.key[secbit]; arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); atomic_add_rel_int(&sc->secret.oddeven, 1); /* Reschedule ourself. */ callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); } /* * Returns the current number of syncache entries. This number * will probably change before you get around to calling * syncache_pcblist. */ int syncache_pcbcount(void) { struct syncache_head *sch; int count, i; for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { /* No need to lock for a read. */ sch = &V_tcp_syncache.hashbase[i]; count += sch->sch_length; } return count; } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) { struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int count, error, i; for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (count >= max_pcbs) { SCH_UNLOCK(sch); goto exit; } if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); if (sc->sc_inc.inc_flags & INC_ISIPV6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); xt.xt_tp.t_inpcb = &xt.xt_inp; xt.xt_tp.t_state = TCPS_SYN_RECEIVED; xt.xt_socket.xso_protocol = IPPROTO_TCP; xt.xt_socket.xso_len = sizeof (struct xsocket); xt.xt_socket.so_type = SOCK_STREAM; xt.xt_socket.so_state = SS_ISCONNECTING; error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); goto exit; } count++; } SCH_UNLOCK(sch); } exit: *pcbs_exported = count; return error; } Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 275357) +++ head/sys/netinet/udp_usrreq.c (revision 275358) @@ -1,1903 +1,1899 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef IPSEC #include #include #endif #include #include /* * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. * Per RFC 3828, July, 2004. */ /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). */ VNET_DEFINE(int, udp_cksum) = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); VNET_DEFINE(struct inpcbhead, ulitecb); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); static VNET_DEFINE(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ VNET_PCPUSTAT_SYSINIT(udpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(udpstat); #endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); #endif #ifdef IPSEC #ifdef IPSEC_NAT_T #define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP) #ifdef INET static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int); #endif #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ static void udp_zone_change(void *tag) { uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_udpcb_zone, maxsockets); } static int udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); } static int udplite_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpliteinp"); return (0); } void udp_init(void) { /* * For now default to 2-tuple UDP hashing - until the fragment * reassembly code can also update the flowid. * * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udpcb_zone, maxsockets); uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void udplite_init(void) { in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE); } /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. While this encodes the * general layout of udpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. */ void kmod_udpstat_inc(int statnum) { counter_u64_add(VNET(udpstat)[statnum], 1); } int udp_newudpcb(struct inpcb *inp) { struct udpcb *up; up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO); if (up == NULL) return (ENOBUFS); inp->inp_ppcb = up; return (0); } void udp_discardcb(struct udpcb *up) { uma_zfree(V_udpcb_zone, up); } #ifdef VIMAGE void udp_destroy(void) { in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); } void udplite_destroy(void) { in_pcbinfo_destroy(&V_ulitecbinfo); } #endif #ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. */ static void udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = 0; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { (*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in, up->u_tun_ctx); return; } off += sizeof(struct udphdr); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); IPSECSTAT_INC(ips_in_polvio); return; } #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ n = udp4_espdecap(inp, n, off); if (n == NULL) /* Consumed. */ return; } #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return; } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif /* INET6 */ append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); } int udp_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in udp_in; struct mbuf *m; struct m_tag *fwd_tag; int cscov_partial, iphlen; m = *mp; iphlen = *offp; ifp = m->m_pkthdr.rcvif; *mp = NULL; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { /* Zero means checksum over the complete packet. */ if (len == 0) len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (proto == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Save a copy of the IP header in case we want restore it for * sending an ICMP error message in response. */ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); return (IPPROTO_DONE); } } else { if (proto == IPPROTO_UDP) { UDPSTAT_INC(udps_nosum); } else { /* UDPLite requires a checksum */ /* XXX: What is the right UDPLite MIB counter here? */ m_freem(m); return (IPPROTO_DONE); } } pcbinfo = get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip_moptions *imo; INP_INFO_RLOCK(pcbinfo); pcblist = get_pcblist(proto); last = NULL; LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; INP_RLOCK(inp); /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct sockaddr_in group; int blocked; if (imo == NULL) { INP_RUNLOCK(inp); continue; } bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(imo, ifp, (struct sockaddr *)&group, (struct sockaddr *)&udp_in); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IPSTAT_INC(ips_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { UDP_PROBE(receive, NULL, last, ip, last, uh); udp_append(last, ip, n, iphlen, &udp_in); } INP_RUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); INP_INFO_RUNLOCK(pcbinfo); goto badunlocked; } UDP_PROBE(receive, NULL, last, ip, last, uh); udp_append(last, ip, m, iphlen, &udp_in); INP_RUNLOCK(last); INP_INFO_RUNLOCK(pcbinfo); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return (IPPROTO_DONE); } /* * Check the minimum TTL for socket. */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen == 0 || up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip, inp, uh); udp_append(inp, ip, m, iphlen, &udp_in); INP_RUNLOCK(inp); return (IPPROTO_DONE); badunlocked: m_freem(m); return (IPPROTO_DONE); } #endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { /* * While udp_ctlinput() always calls udp_notify() with a read lock * when invoking it directly, in_pcbnotifyall() currently uses write * locks due to sharing code with TCP. For now, accept either a read * or a write lock, but a read lock is sufficient. */ INP_LOCK_ASSERT(inp); inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } #ifdef INET static void udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. */ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } } else in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); } void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo)); } void udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo)); } #endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_udbinfo); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_udbinfo); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); #endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case UDP_ENCAP: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); #endif switch (optval) { case 0: /* Clear all UDP encap. */ #ifdef IPSEC_NAT_T up->u_flags &= ~UF_ESPINUDP_ALL; #endif break; #ifdef IPSEC_NAT_T case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->u_flags &= ~UF_ESPINUDP_ALL; if (optval == UDP_ENCAP_ESPINUDP) up->u_flags |= UF_ESPINUDP; else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE) up->u_flags |= UF_ESPINUDP_NON_IKE; break; #endif default: error = EINVAL; break; } INP_WUNLOCK(inp); break; case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if ((optval != 0 && optval < 8) || (optval > 65535)) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #ifdef IPSEC_NAT_T case UDP_ENCAP: up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); optval = up->u_flags & UF_ESPINUDP_ALL; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #ifdef INET #define UH_WLOCKED 2 #define UH_RLOCKED 1 #define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; int cscov_partial = 0; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; - int flowid_type = 0; - int use_flowid = 0; + uint8_t flowtype = M_HASHTYPE_NONE; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; INP_RLOCK(inp); tos = inp->inp_ip_tos; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; case IP_TOS: if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) { error = EINVAL; break; } tos = *(u_char *)CMSG_DATA(cm); break; case IP_FLOWID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid = *(uint32_t *) CMSG_DATA(cm); break; case IP_FLOWTYPE: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } - flowid_type = *(uint32_t *) CMSG_DATA(cm); - use_flowid = 1; + flowtype = *(uint32_t *) CMSG_DATA(cm); break; #ifdef RSS case IP_RSSBUCKETID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } /* This is just a placeholder for now */ break; #endif /* RSS */ default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Depending on whether or not the application has bound or connected * the socket, we may have to do varying levels of work. The optimal * case is for a connected UDP socket, as a global lock isn't * required at all. * * In order to decide which we need, we require stability of the * inpcb binding, which we ensure by acquiring a read lock on the * inpcb. This doesn't strictly follow the lock order, so we play * the trylock and retry game; note that we may end up with more * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. * * XXXRW: Check that hash locking update here is correct. */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = get_inpcbinfo(pr); sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_RUNLOCK(inp); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { INP_HASH_RLOCK(pcbinfo); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_HASH_LOCK_ASSERT(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. */ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header and addresses and length put * into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); if (pr == IPPROTO_UDPLITE) { struct udpcb *up; uint16_t plen; up = intoudpcb(inp); cscov = up->u_txcslen; plen = (u_short)len + sizeof(struct udphdr); if (cscov >= plen) cscov = 0; ui->ui_len = htons(plen); ui->ui_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else ui->ui_v = IPVERSION << 4; /* * Set the Don't Fragment bit in the IP header. */ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= htons(IP_DF); } ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up checksum and output datagram. */ ui->ui_sum = 0; if (pr == IPPROTO_UDPLITE) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; if (cscov_partial) { if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) ui->ui_sum = 0xffff; } else { if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) ui->ui_sum = 0xffff; } } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); /* * Setup flowid / RSS information for outbound socket. * * Once the UDP code decides to set a flowid some other way, * this allows the flowid to be overridden by userland. */ - if (use_flowid) { - m->m_flags |= M_FLOWID; + if (flowtype != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = flowid; - M_HASHTYPE_SET(m, flowid_type); + M_HASHTYPE_SET(m, flowtype); #ifdef RSS } else { uint32_t hash_val, hash_type; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later later on connected socket values should be * cached in the inpcb and reused, rather than constantly * re-calculating it. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; - m->m_flags |= M_FLOWID; M_HASHTYPE_SET(m, hash_type); } #endif } #ifdef RSS /* * Don't override with the inp cached flowid value. * * Depending upon the kind of send being done, the inp * flowid/flowtype values may actually not be appropriate * for this particular socket send. * * We should either leave the flowid at zero (which is what is * currently done) or set it to some software generated * hash value based on the packet contents. */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) INP_HASH_RUNLOCK(pcbinfo); UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); if (unlock_udbinfo == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: if (unlock_udbinfo == UH_WLOCKED) { INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { INP_HASH_RUNLOCK(pcbinfo); INP_RUNLOCK(inp); } else INP_RUNLOCK(inp); m_freem(m); return (error); } #if defined(IPSEC) && defined(IPSEC_NAT_T) /* * Potentially decap ESP in UDP frame. Check for an ESP header * and optional marker; if present, strip the UDP header and * push the result through IPSec. * * Returns mbuf to be processed (potentially re-allocated) or * NULL if consumed and/or processed. */ static struct mbuf * udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) { size_t minlen, payload, skip, iphlen; caddr_t data; struct udpcb *up; struct m_tag *tag; struct udphdr *udphdr; struct ip *ip; INP_RLOCK_ASSERT(inp); /* * Pull up data so the longest case is contiguous: * IP/UDP hdr + non ESP marker + ESP hdr. */ minlen = off + sizeof(uint64_t) + sizeof(struct esp); if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if ((m = m_pullup(m, minlen)) == NULL) { IPSECSTAT_INC(ips_in_inval); return (NULL); /* Bypass caller processing. */ } data = mtod(m, caddr_t); /* Points to ip header. */ payload = m->m_len - off; /* Size of payload. */ if (payload == 1 && data[off] == '\xff') return (m); /* NB: keepalive packet, no decap. */ up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0, ("u_flags 0x%x", up->u_flags)); /* * Check that the payload is large enough to hold an * ESP header and compute the amount of data to remove. * * NB: the caller has already done a pullup for us. * XXX can we assume alignment and eliminate bcopys? */ if (up->u_flags & UF_ESPINUDP_NON_IKE) { /* * draft-ietf-ipsec-nat-t-ike-0[01].txt and * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring * possible AH mode non-IKE marker+non-ESP marker * from draft-ietf-ipsec-udp-encaps-00.txt. */ uint64_t marker; if (payload <= sizeof(uint64_t) + sizeof(struct esp)) return (m); /* NB: no decap. */ bcopy(data + off, &marker, sizeof(uint64_t)); if (marker != 0) /* Non-IKE marker. */ return (m); /* NB: no decap. */ skip = sizeof(uint64_t) + sizeof(struct udphdr); } else { uint32_t spi; if (payload <= sizeof(struct esp)) { IPSECSTAT_INC(ips_in_inval); m_freem(m); return (NULL); /* Discard. */ } bcopy(data + off, &spi, sizeof(uint32_t)); if (spi == 0) /* Non-ESP marker. */ return (m); /* NB: no decap. */ skip = sizeof(struct udphdr); } /* * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember * the UDP ports. This is required if we want to select * the right SPD for multiple hosts behind same NAT. * * NB: ports are maintained in network byte order everywhere * in the NAT-T code. */ tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, 2 * sizeof(uint16_t), M_NOWAIT); if (tag == NULL) { IPSECSTAT_INC(ips_in_nomem); m_freem(m); return (NULL); /* Discard. */ } iphlen = off - sizeof(struct udphdr); udphdr = (struct udphdr *)(data + iphlen); ((uint16_t *)(tag + 1))[0] = udphdr->uh_sport; ((uint16_t *)(tag + 1))[1] = udphdr->uh_dport; m_tag_prepend(m, tag); /* * Remove the UDP header (and possibly the non ESP marker) * IP header length is iphlen * Before: * <--- off ---> * +----+------+-----+ * | IP | UDP | ESP | * +----+------+-----+ * <-skip-> * After: * +----+-----+ * | IP | ESP | * +----+-----+ * <-skip-> */ ovbcopy(data, data + skip, iphlen); m_adj(m, skip); ip = mtod(m, struct ip *); ip->ip_len = htons(ntohs(ip->ip_len) - skip); ip->ip_p = IPPROTO_ESP; /* * We cannot yet update the cksums so clear any * h/w cksum flags as they are no longer valid. */ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR); (void) ipsec4_common_input(m, iphlen, ip->ip_p); return (NULL); /* NB: consumed, bypass processing. */ } #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ static void udp_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } #endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, void *ctx) { struct inpcb *inp; struct udpcb *up; KASSERT(so->so_type == SOCK_DGRAM, ("udp_set_kernel_tunneling: !dgram")); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); if (up->u_tun_func != NULL) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } #ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); return (EISCONN); } sin = (struct sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); return (error); } INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } #endif /* INET */ int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } #ifdef INET struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; #endif /* INET */ Index: head/sys/netinet6/in6_pcb.c =================================================================== --- head/sys/netinet6/in6_pcb.c (revision 275357) +++ head/sys/netinet6/in6_pcb.c (revision 275358) @@ -1,1287 +1,1287 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct inpcb *in6_pcblookup_hash_locked(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *); int in6_pcbbind(register struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int error, lookupflags = 0; int reuseport = (so->so_options & SO_REUSEPORT); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in6_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); } else { sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof(*sin6)) return (EINVAL); /* * family check. */ if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if ((error = prison_local_ip6(cred, &sin6->sin6_addr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); lport = sin6->sin6_port; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow compepte duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ifa; sin6->sin6_port = 0; /* yech... */ if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) == NULL && (inp->inp_flags & INP_BINDANY) == 0) { return (EADDRNOTAVAIL); } /* * XXX: bind to an anycast address might accidentally * cause sending a packet with anycast source address. * We should allow to bind to a deprecated address, since * the application dares to use it. */ if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { ifa_free(ifa); return (EADDRNOTAVAIL); } if (ifa != NULL) ifa_free(ifa); } if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) && (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } #endif } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0) { return (EADDRINUSE); } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, lookupflags, cred); if (t && t->inp_flags & INP_TIMEWAIT) { tw = intotw(t); if (tw == NULL) return (EADDRINUSE); if ((reuseport & tw->tw_so_options) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || ((inp->inp_vflag & INP_IPV6PROTO) == (t->inp_vflag & INP_IPV6PROTO)))) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_vflag & INP_IPV6PROTO) != 0)) return (EADDRINUSE); } #endif } inp->in6p_laddr = sin6->sin6_addr; } if (lport == 0) { if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; return (error); } } else { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } } return (0); } /* * Transform old in6_pcbconnect() into an inner subroutine for new * in6_pcbconnect(): Do some validity-checking on the remote * address (in mbuf 'nam') and then determine local host address * (i.e., which interface) to use to access that remote host. * * This preserves definition of in6_pcbconnect(), while supporting a * slightly different version for T/TCP. (This is more than * a bit of a kludge, but cleaning up the internal interfaces would * have forced minor changes in every protocol). */ static int in6_pcbladdr(register struct inpcb *inp, struct sockaddr *nam, struct in6_addr *plocal_addr6) { register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; int error = 0; struct ifnet *ifp = NULL; int scope_ambiguous = 0; struct in6_addr in6a; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); /* XXXRW: why? */ if (nam->sa_len != sizeof (*sin6)) return (EINVAL); if (sin6->sin6_family != AF_INET6) return (EAFNOSUPPORT); if (sin6->sin6_port == 0) return (EADDRNOTAVAIL); if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if (!TAILQ_EMPTY(&V_in6_ifaddrhead)) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) sin6->sin6_addr = in6addr_loopback; } if ((error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr)) != 0) return (error); error = in6_selectsrc(sin6, inp->in6p_outputopts, inp, NULL, inp->inp_cred, &ifp, &in6a); if (error) return (error); if (ifp && scope_ambiguous && (error = in6_setscope(&sin6->sin6_addr, ifp, NULL)) != 0) { return(error); } /* * Do not update this earlier, in case we return with an error. * * XXX: this in6_selectsrc result might replace the bound local * address with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ *plocal_addr6 = in6a; /* * Don't do pcblookup call here; return interface in * plocal_addr6 * and exit to caller, that will do the lookup. */ return (0); } /* * Outer subroutine: * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in6_pcbconnect_mbuf(register struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr addr6; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. */ if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) return (error); if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL) != NULL) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, cred); if (error) return (error); } inp->in6p_laddr = addr6; } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; if (inp->inp_flags & IN6P_AUTOFLOWLABEL) inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash_mbuf(inp, m); return (0); } int in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in6_pcbconnect_mbuf(inp, nam, cred, NULL)); } void in6_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; /* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; in_pcbrehash(inp); } struct sockaddr * in6_sockaddr(in_port_t port, struct in6_addr *addr_p) { struct sockaddr_in6 *sin6; sin6 = malloc(sizeof *sin6, M_SONAME, M_WAITOK); bzero(sin6, sizeof *sin6); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = port; sin6->sin6_addr = *addr_p; (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ return (struct sockaddr *)sin6; } struct sockaddr * in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in sin; struct sockaddr_in6 *sin6_p; bzero(&sin, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = port; sin.sin_addr = *addr_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); in6_sin_2_v4mapsin6(&sin, sin6_p); return (struct sockaddr *)sin6_p; } int in6_getsockaddr(struct socket *so, struct sockaddr **nam) { register struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->in6p_laddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->in6p_faddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getsockaddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif { /* scope issues will be handled in in6_getsockaddr(). */ error = in6_getsockaddr(so, nam); } return error; } int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getpeeraddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif /* scope issues will be handled in in6_getpeeraddr(). */ error = in6_getpeeraddr(so, nam); return error; } /* * Pass some notification to all connections of a protocol * associated with address dst. The local address and/or port numbers * may be specified to limit the search. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. */ void in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; struct sockaddr_in6 sa6_src, *sa6_dst; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; int errno; if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6) return; sa6_dst = (struct sockaddr_in6 *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return; /* * note that src can be NULL when we get notify by local fragmentation. */ sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; flowinfo = sa6_src.sin6_flowinfo; /* * Redirects go to all references to the destination, * and use in6_rtchange to invalidate the route cache. * Dead host indications: also use in6_rtchange to invalidate * the cache, and deliver the error to all the sockets. * Otherwise, if we have knowledge of the local port and address, * deliver only to that socket. */ if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { fport = 0; lport = 0; bzero((caddr_t)&sa6_src.sin6_addr, sizeof(sa6_src.sin6_addr)); if (cmd != PRC_HOSTDEAD) notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); if ((inp->inp_vflag & INP_IPV6) == 0) { INP_WUNLOCK(inp); continue; } /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to * know the value, notify. Note that we notify for all * disconnected sockets if the corresponding application * wanted. This is because some UDP applications keep sending * sockets disconnected. * XXX: should we avoid to notify the value to TCP sockets? */ if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 && (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr))) { ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst, (u_int32_t *)cmdarg); } /* * Detect if we should notify the error. If no source and * destination ports are specifed, but non-zero flowinfo and * local address match, notify the error. This is the case * when the error is delivered with an encrypted buffer * by ESP. Otherwise, just compare addresses and ports * as usual. */ if (lport == 0 && fport == 0 && flowinfo && inp->inp_socket != NULL && flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) goto do_notify; else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr) || inp->inp_socket == 0 || (lport && inp->inp_lport != lport) || (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) { INP_WUNLOCK(inp); continue; } do_notify: if (notify) { if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ struct inpcb * in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, u_short lport, int lookupflags, struct ucred *cred) { register struct inpcb *inp; int matchwild = 3, wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_WLOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_lport == lport) { /* Found. */ if (cred == NULL || prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) continue; /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) wildcard++; if (!IN6_IS_ADDR_UNSPECIFIED( &inp->in6p_laddr)) { if (IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; else if (!IN6_ARE_ADDR_EQUAL( &inp->in6p_laddr, laddr)) continue; } else { if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *in6p; struct ip6_moptions *im6o; int i, gap; INP_INFO_RLOCK(pcbinfo); LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(in6p); im6o = in6p->in6p_moptions; if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) { /* * Unselect the outgoing ifp for multicast if it * is being detached. */ if (im6o->im6o_multicast_ifp == ifp) im6o->im6o_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ gap = 0; for (i = 0; i < im6o->im6o_num_memberships; i++) { if (im6o->im6o_membership[i]->in6m_ifp == ifp) { in6_mc_leave(im6o->im6o_membership[i], NULL); gap++; } else if (gap != 0) { im6o->im6o_membership[i - gap] = im6o->im6o_membership[i]; } } im6o->im6o_num_memberships -= gap; } INP_WUNLOCK(in6p); } INP_INFO_RUNLOCK(pcbinfo); } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in6_losing(struct inpcb *in6p) { /* * We don't store route pointers in the routing table anymore */ return; } /* * After a routing change, flush old routing * and allocate a (hopefully) better one. */ struct inpcb * in6_rtchange(struct inpcb *inp, int errno) { /* * We don't store route pointers in the routing table anymore */ return inp; } #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(faddr), lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } /* * Then look for a wildcard match in the pcbgroup. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[ INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) goto found; else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; if (inp != NULL) goto found; } /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) goto found; else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; if (inp != NULL) goto found; } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking buf", __func__); return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list. */ static struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(faddr), lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) return (inp); else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ /* * Not found. */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; INP_HASH_RLOCK(pcbinfo); inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in6_pcbgroup.c. */ struct inpcb * in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && - !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { + M_HASHTYPE_TEST(m, M_HASHTYPE_NONE) == 0) { pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m) { struct ip6_hdr *ip; ip = mtod(m, struct ip6_hdr *); bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_addr = ip->ip6_src; (void)sa6_recoverscope(sin6); /* XXX: should catch errors... */ return; } Index: head/sys/netinet6/ip6_output.c =================================================================== --- head/sys/netinet6/ip6_output.c (revision 275357) +++ head/sys/netinet6/ip6_output.c (revision 275358) @@ -1,2977 +1,2977 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_sctp.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #include #endif /* IPSEC */ #ifdef SCTP #include #include #endif #include #include #ifdef FLOWTABLE #include #endif extern int in6_mcast_loop; struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, struct ucred *, int); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *); static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, struct ucred *, int, int, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct route_in6 *, struct route_in6 *, struct ifnet *, struct in6_addr *, u_long *, int *, u_int); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); /* * Make an extension header from option data. hp is the source, and * mp is the destination. */ #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (/*CONSTCOND*/ 0) /* * Form a chain of extension headers. * m is the extension header mbuf * mp is the previous mbuf in the chain * p is the next header * i is the type of option. */ #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset) { u_short csum; csum = in_cksum_skip(m, offset + plen, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("%s: delayed m_pullup, m->len: %d plen %u off %u " "csum_flags=%b\n", __func__, m->m_len, plen, offset, (int)m->m_pkthdr.csum_flags, CSUM_BITS); /* * XXX this should not happen, but if it does, the correct * behavior may be to insert the checksum in the appropriate * next mbuf in the chain. */ return; } *(u_short *)(m->m_data + offset) = csum; } /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route_in6 ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_mtu. * * ifpp - XXX: just for statistics */ /* * XXX TODO: no flowid is assigned for outbound flows? */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6, *mhip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev = NULL; int hlen, tlen, len, off; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, src_sa, dst_sa; struct in6_addr odst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst, src0, dst0; u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; ip6 = mtod(m, struct ip6_hdr *); if (ip6 == NULL) { printf ("ip6 is NULL"); goto bad; } if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); - if (((flags & IP_NODEFAULTFLOWID) == 0) && - (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID))) { + if ((flags & IP_NODEFAULTFLOWID) == 0) { + /* unconditionally set flowid */ m->m_pkthdr.flowid = inp->inp_flowid; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, inp->inp_flowtype); } } finaldst = ip6->ip6_dst; bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ if (opt->ip6po_rthdr) { /* * Destination options header(1st part) * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. */ switch(ip6_ipsec_output(&m, inp, &flags, &error, &ifp)) { case 1: /* Bad packet */ goto freehdrs; case -1: /* IPSec done */ goto done; case 0: /* No IPSec */ default: break; } #endif /* IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If there is at least one extension header, * separate IP6 header from the payload. */ if (optlen && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ u_char *nexthdrp = &ip6->ip6_nxt; mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); /* * If there is a routing header, discard the packet. */ if (exthdrs.ip6e_rthdr) { error = EINVAL; goto bad; } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } IP6STAT_INC(ip6s_localout); /* * Route packet. */ if (ro == 0) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET6, m, (struct route *)ro); #endif fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if (ro->ro_rt && fwd_tag == NULL) { rt = ro->ro_rt; ifp = ro->ro_rt->rt_ifp; } else { if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp, &rt, fibnum); if (error != 0) { if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); counter_u64_add(rt->rt_pksent, 1); } /* * The outgoing interface must be in the zone of source and * destination addresses. */ origifp = ifp; src0 = ip6->ip6_src; if (in6_setscope(&src0, origifp, &zone)) goto badscope; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) goto badscope; dst0 = ip6->ip6_dst; if (in6_setscope(&dst0, origifp, &zone)) goto badscope; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { goto badscope; } /* We should use ia_ifp to support the case of * sending packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) ifp = ia->ia_ifp; /* scope check is done. */ goto routefound; badscope: IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; routefound: if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) dst = (struct sockaddr_in6 *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m, dst); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. */ if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu, &alwaysfrag, fibnum)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip6 = mtod(m, struct ip6_hdr *); needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else needfiblookup = 1; /* Redo the routing table lookup. */ } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_RTFREE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in6 *)&ro->ro_dst; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. */ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif m->m_pkthdr.csum_flags &= ifp->if_hwassist; tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data as * well as returning an error code (the latter is not described * in the API spec.) */ u_int32_t mtu32; struct ip6ctlparam ip6cp; mtu32 = (u_int32_t)mtu; bzero(&ip6cp, sizeof(ip6cp)); ip6cp.ip6c_cmdarg = (void *)&mtu32; pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst, (void *)&ip6cp); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); goto done; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { struct mbuf **mnext, *m_frgpart; struct ip6_frag *ip6f; u_int32_t id = htonl(ip6_randomid()); u_char nextproto; int qslots = ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * Verify that we have any chance at all of being able to queue * the packet or packet fragments */ if (qslots <= 0 || ((u_int)qslots * (mtu - hlen) < tlen /* - hlen */)) { error = ENOBUFS; IP6STAT_INC(ip6s_odropped); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. * XXX-BZ handle the hw offloading case. Need flags. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif mnext = &m->m_nextpkt; /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; for (off = hlen; off < tlen; off += len) { m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) { error = ENOBUFS; IP6STAT_INC(ip6s_odropped); goto sendorfree; } m->m_flags = m0->m_flags & M_COPYFLAGS; *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6STAT_INC(ip6s_odropped); goto sendorfree; } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + len >= tlen) len = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(len + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copy(m0, off, len)) == 0) { error = ENOBUFS; IP6STAT_INC(ip6s_odropped); goto sendorfree; } m_cat(m, m_frgpart); m->m_pkthdr.len = len + hlen + sizeof(*ip6f); m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum; m->m_pkthdr.rcvif = NULL; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; IP6STAT_INC(ip6s_ofragments); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbages. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: if (ro == &ip6route) RO_RTFREE(ro); if (ro_pmtu == &ip6route) RO_RTFREE(ro_pmtu); return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == 0) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == 0) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } static int ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, struct ifnet *ifp, struct in6_addr *dst, u_long *mtup, int *alwaysfragp, u_int fibnum) { u_int32_t mtu = 0; int alwaysfrag = 0; int error = 0; if (ro_pmtu != ro) { /* The first hop and the final destination may differ. */ struct sockaddr_in6 *sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (ro_pmtu->ro_rt && ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 || !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) { RTFREE(ro_pmtu->ro_rt); ro_pmtu->ro_rt = (struct rtentry *)NULL; } if (ro_pmtu->ro_rt == NULL) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_rtalloc(ro_pmtu, fibnum); } } if (ro_pmtu->ro_rt) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (ifp == NULL) ifp = ro_pmtu->ro_rt->rt_ifp; ifmtu = IN6_LINKMTU(ifp); mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, ro_pmtu->ro_rt->rt_mtu); else mtu = ro_pmtu->ro_rt->rt_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with framgent header attached. * (fragment header is needed regardless from the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEADDR) != 0) in6p->inp_flags2 |= INP_REUSEADDR; else in6p->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(in6p); error = 0; break; case SO_REUSEPORT: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEPORT) != 0) in6p->inp_flags2 |= INP_REUSEPORT; else in6p->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(in6p); error = 0; break; case SO_SETFIB: INP_WLOCK(in6p); in6p->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(in6p); error = 0; break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_BINDMULTI: #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: #endif if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->inp_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(in6p); \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(in6p); \ in6p->inp_flags |= IN6P_RFC2292; \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) #define OPTSET2(bit, val) do { \ INP_WLOCK(in6p); \ if (val) \ in6p->inp_flags2 |= bit; \ else \ in6p->inp_flags2 &= ~bit; \ INP_WUNLOCK(in6p); \ } while (0) #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0) case IPV6_RECVPKTINFO: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_RECVHOPLIMIT: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (in6p->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->inp_vflag &= ~INP_IPV4; else in6p->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { in6p->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; #endif } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(in6p, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(in6p); switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->inp_flags &= ~(INP_HIGHPORT); in6p->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(in6p); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(in6p, optname, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. */ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: #ifdef RSS case IPV6_RSSBUCKETID: #endif switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = in6p->inp_flowid; break; case IPV6_FLOWTYPE: optval = in6p->inp_flowtype; break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(in6p->inp_flowid, in6p->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; #endif case IPV6_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct route_in6 sro; bzero(&sro, sizeof(sro)); if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. */ error = ip6_getpmtu(&sro, NULL, NULL, &in6p->in6p_faddr, &pmtu, NULL, so->so_fibnum); if (sro.ro_rt) RTFREE(sro.ro_rt); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(in6p->in6p_outputopts, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(in6p, sopt); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m = NULL; struct mbuf **mp = &m; size_t ovalsize = sopt->sopt_valsize; caddr_t oval = (caddr_t)sopt->sopt_val; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; sopt->sopt_valsize = ovalsize; sopt->sopt_val = oval; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(in6p, req, len, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *in6p = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if ((optval % 2) != 0) { /* the API assumes even offset values */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else in6p->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = in6p->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_WAITOK); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } static int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) { void *optdata = NULL; int optdatalen = 0; struct ip6_ext *ip6e; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { case IPV6_PKTINFO: if (pktopt && pktopt->ip6po_pktinfo) optdata = (void *)pktopt->ip6po_pktinfo; else { /* XXX: we don't have to do this every time... */ bzero(&null_pktinfo, sizeof(null_pktinfo)); optdata = (void *)&null_pktinfo; } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) optdata = (void *)&pktopt->ip6po_tclass; else optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: if (pktopt && pktopt->ip6po_hbh) { optdata = (void *)pktopt->ip6po_hbh; ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDR: if (pktopt && pktopt->ip6po_rthdr) { optdata = (void *)pktopt->ip6po_rthdr; ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDRDSTOPTS: if (pktopt && pktopt->ip6po_dest1) { optdata = (void *)pktopt->ip6po_dest1; ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_DSTOPTS: if (pktopt && pktopt->ip6po_dest2) { optdata = (void *)pktopt->ip6po_dest2; ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_NEXTHOP: if (pktopt && pktopt->ip6po_nexthop) { optdata = (void *)pktopt->ip6po_nexthop; optdatalen = pktopt->ip6po_nexthop->sa_len; } break; case IPV6_USE_MIN_MTU: if (pktopt) optdata = (void *)&pktopt->ip6po_minmtu; else optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) optdata = (void *)&pktopt->ip6po_prefer_tempaddr; else optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif return (ENOPROTOOPT); } error = sooptcopyout(sopt, optdata, optdatalen); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_rt) { RTFREE(pktopt->ip6po_nextroute.ro_rt); pktopt->ip6po_nextroute.ro_rt = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL && canwait == M_NOWAIT)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = 0; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. Conversely, * RFC3542 types should not be specified in the context of RFC2292. */ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr)) return (EINVAL); /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > V_if_index) return (ENXIO); if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } if (ifp != NULL && ( ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) return (ENETDOWN); if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { struct in6_ifaddr *ia; ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr); if (ia == NULL) return (EADDRNOTAVAIL); ifa_free(&ia->ia_ifa); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. */ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */ error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advacned API is ambiguous on this point. * Our approach is to determine the position based * according to the existence of a routing header. * Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types. */ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst) { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copy(m, 0, M_COPYALL); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if (!M_WRITABLE(copym) || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } #ifdef DIAGNOSTIC if (copym->m_len < sizeof(*ip6)) { m_freem(copym); return; } #endif ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); (void)if_simloop(ifp, copym, dst->sin6_family, 0); } /* * Chop IPv6 header off from the payload. */ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); MH_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *in6p) { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen } Index: head/sys/netinet6/udp6_usrreq.c =================================================================== --- head/sys/netinet6/udp6_usrreq.c (revision 275357) +++ head/sys/netinet6/udp6_usrreq.c (revision 275358) @@ -1,1251 +1,1250 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ * $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ extern struct protosw inetsw[]; static void udp6_detach(struct socket *so); static void udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct sockaddr_in6 *fromsa) { struct socket *so; struct mbuf *opts; struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { (*up->u_tun_func)(n, off, inp, (struct sockaddr *)fromsa, up->u_tun_ctx); return; } #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec6_in_reject(n, inp)) { m_freem(n); IPSEC6STAT_INC(ips_in_polvio); return; } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return; } #endif opts = NULL; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(inp, n, &opts); m_adj(n, off + sizeof(struct udphdr)); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)fromsa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); } int udp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ifnet *ifp; struct ip6_hdr *ip6; struct udphdr *uh; struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; int off = *offp; int cscov_partial; int plen, ulen; struct sockaddr_in6 fromsa; struct m_tag *fwd_tag; uint16_t uh_sum; uint8_t nxt; ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh)); if (!uh) return (IPPROTO_DONE); #endif UDPSTAT_INC(udps_ipackets); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6); ulen = ntohs((u_short)uh->uh_ulen); nxt = ip6->ip6_nxt; cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0; if (nxt == IPPROTO_UDPLITE) { /* Zero means checksum over the complete packet. */ if (ulen == 0) ulen = plen; if (ulen == plen) cscov_partial = 0; if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } if (uh->uh_sum == 0) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } } else { if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (uh->uh_sum == 0) { UDPSTAT_INC(udps_nosum); goto badunlocked; } } if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in6_cksum_pseudo(ip6, ulen, nxt, m->m_pkthdr.csum_data); uh_sum ^= 0xffff; } else uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen); if (uh_sum != 0) { UDPSTAT_INC(udps_badsum); goto badunlocked; } /* * Construct sockaddr format source address. */ init_sin6(&fromsa, m); fromsa.sin6_port = uh->uh_sport; pcbinfo = get_inpcbinfo(nxt); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip6_moptions *imo; INP_INFO_RLOCK(pcbinfo); /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address * specified in the received packet will not match laddr. To * handle this situation, matching is relaxed if the * receiving interface is the same as one specified in the * socket and if the destination multicast address matches * one of the multicast groups specified in the socket. */ /* * KAME note: traditionally we dropped udpiphdr from mbuf * here. We need udphdr for IPsec processing so we do that * later. */ pcblist = get_pcblist(nxt); last = NULL; LIST_FOREACH(inp, pcblist, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->inp_lport != uh->uh_dport) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src) || inp->inp_fport != uh->uh_sport) continue; } /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is (supposed to be) held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->in6p_moptions; if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct sockaddr_in6 mcaddr; int blocked; INP_RLOCK(inp); bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(imo, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IP6STAT_INC(ip6s_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); /* XXX */ continue; } INP_RUNLOCK(inp); } if (last != NULL) { struct mbuf *n; if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { INP_RLOCK(last); UDP_PROBE(receive, NULL, last, ip6, last, uh); udp6_append(last, n, off, &fromsa); INP_RUNLOCK(last); } } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noport); UDPSTAT_INC(udps_noportmcast); goto badheadlocked; } INP_RLOCK(last); INP_INFO_RUNLOCK(pcbinfo); UDP_PROBE(receive, NULL, last, ip6, last, uh); udp6_append(last, m, off, &fromsa); INP_RUNLOCK(last); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(pcbinfo, &ip6->ip6_src, uh->uh_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? htons(next_hop6->sin6_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP6_NEXTHOP; } else inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp == NULL) { if (udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP [%s]:%d from [%s]:%d\n", ip6_sprintf(ip6bufd, &ip6->ip6_dst), ntohs(uh->uh_dport), ip6_sprintf(ip6bufs, &ip6->ip6_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); UDPSTAT_INC(udps_noportmcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); return (IPPROTO_DONE); } INP_RLOCK_ASSERT(inp); up = intoudpcb(inp); if (cscov_partial) { if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip6, inp, uh); udp6_append(inp, m, off, &fromsa); INP_RUNLOCK(inp); return (IPPROTO_DONE); badheadlocked: INP_INFO_RUNLOCK(pcbinfo); badunlocked: if (m) m_freem(m); return (IPPROTO_DONE); } static void udp6_common_ctlinput(int cmd, struct sockaddr *sa, void *d, struct inpcbinfo *pcbinfo) { struct udphdr uh; struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; } *uhp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } if (ip6) { /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* Check if we can safely examine src and dst ports. */ if (m->m_pkthdr.len < off + sizeof(*uhp)) return; bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); (void)in6_pcbnotify(pcbinfo, sa, uh.uh_dport, (struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd, cmdarg, notify); } else (void)in6_pcbnotify(pcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } void udp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { return (udp6_common_ctlinput(cmd, sa, d, &V_udbinfo)); } void udplite6_ctlinput(int cmd, struct sockaddr *sa, void *d) { return (udp6_common_ctlinput(cmd, sa, d, &V_ulitecbinfo)); } static int udp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); if (req->newlen != sizeof(addrs)) return (EINVAL); if (req->oldlen != sizeof(struct xucred)) return (EINVAL); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); static int udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr, in6a; struct sockaddr_in6 *sin6 = NULL; struct ifnet *oifp = NULL; int cscov_partial = 0; int scope_ambiguous = 0; u_short fport; int error = 0; uint8_t nxt; uint16_t cscov = 0; struct ip6_pktopts *optp, opt; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (addr6) { /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return (error); } if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, IPPROTO_UDP)) != 0) goto release; optp = &opt; } else optp = inp->in6p_outputopts; if (sin6) { faddr = &sin6->sin6_addr; /* * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such kind of logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } fport = sin6->sin6_port; /* allow 0 port */ if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * I believe we should explicitly discard the * packet when mapped addresses are disabled, * rather than send the packet as an IPv6 one. * If we chose the latter approach, the packet * might be sent out on the wire based on the * default route, the situation which we'd * probably want to avoid. * (20010421 jinmei@kame.net) */ error = EINVAL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) { /* * when remote addr is an IPv4-mapped address, * local addr should not be an IPv6 address, * since you cannot determine how to map IPv6 * source address to IPv4. */ error = EINVAL; goto release; } af = AF_INET; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { error = in6_selectsrc(sin6, optp, inp, NULL, td->td_ucred, &oifp, &in6a); if (error) goto release; if (oifp && scope_ambiguous && (error = in6_setscope(&sin6->sin6_addr, oifp, NULL))) { goto release; } laddr = &in6a; } else laddr = &inp->in6p_laddr; /* XXX */ if (laddr == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto release; } if (inp->inp_lport == 0 && (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; goto release; } } else { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto release; } if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the * application sets the V6ONLY flag after * connecting the foreign address. * Such applications should be fixed, * so we bark here. */ log(LOG_INFO, "udp6_output: IPV6_V6ONLY " "option was set for a connected socket\n"); error = EINVAL; goto release; } else af = AF_INET; } laddr = &inp->in6p_laddr; faddr = &inp->in6p_faddr; fport = inp->inp_fport; } if (af == AF_INET) hlen = sizeof(struct ip); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT); if (m == 0) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (nxt == IPPROTO_UDPLITE) { struct udpcb *up; up = intoudpcb(inp); cscov = up->u_txcslen; if (cscov >= plen) cscov = 0; udp6->uh_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; switch (af) { case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)plen); ip6->ip6_nxt = nxt; ip6->ip6_hlim = in6_selecthlim(inp, NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; if (cscov_partial) { if ((udp6->uh_sum = in6_cksum_partial(m, nxt, sizeof(struct ip6_hdr), plen, cscov)) == 0) udp6->uh_sum = 0xffff; } else { udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } /* * XXX for now assume UDP is 2-tuple. * Later on this may become configurable as 4-tuple; * we should support that. * * XXX .. and we should likely cache this in the inpcb. */ #ifdef RSS m->m_pkthdr.flowid = rss_hash_ip6_2tuple(*faddr, *laddr); - m->m_flags |= M_FLOWID; M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV6); #endif flags = 0; #ifdef RSS /* * Don't override with the inp cached flowid. * * Until the whole UDP path is vetted, it may actually * be incorrect. */ flags |= IP_NODEFAULTFLOWID; #endif UDP_PROBE(send, NULL, inp, ip6, inp, udp6); UDPSTAT_INC(udps_opackets); error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions, NULL, inp); break; case AF_INET: error = EAFNOSUPPORT; goto release; } goto releaseopt; release: m_freem(m); releaseopt: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } return (error); } static void udp6_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_abort: inp == NULL")); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; (*pru->pru_abort)(so); return; } #endif INP_WLOCK(inp); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp6_attach: inp != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); } INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; /* just to be sure */ /* * XXX: ugly!! * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } static int udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)nam; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) inp->inp_vflag |= INP_IPV4; #ifdef INET else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); goto out; } #endif } error = in6_pcbbind(inp, nam, td->td_ucred); #ifdef INET out: #endif INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp6_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_close: inp == NULL")); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; (*pru->pru_disconnect)(so); return; } #endif INP_WLOCK(inp); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in6 *sin6; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); sin6 = (struct sockaddr_in6 *)nam; KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); /* * XXXRW: Need to clarify locking of v4/v6 flags. */ INP_WLOCK(inp); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto out; } in6_sin6_2_sin(&sin, sin6); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); goto out; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = EISCONN; goto out; } inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in6_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); out: INP_WUNLOCK(inp); return (error); } static void udp6_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp6_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; (void)(*pru->pru_disconnect)(so); return (0); } #endif INP_WLOCK(inp); if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto out; } INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); out: INP_WUNLOCK(inp); return (0); } static int udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error = 0; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_send: inp == NULL")); INP_WLOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { error = EINVAL; goto bad; } if (addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; goto bad; } } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; struct sockaddr_in6 *sin6 = 0; if (addr == 0) hasv4addr = (inp->inp_vflag & INP_IPV4); else { sin6 = (struct sockaddr_in6 *)addr; hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 1 : 0; } if (hasv4addr) { struct pr_usrreqs *pru; /* * XXXRW: We release UDP-layer locks before calling * udp_send() in order to avoid recursion. However, * this does mean there is a short window where inp's * fields are unstable. Could this lead to a * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ INP_WUNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock(addr); pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs; /* addr will just be freed in sendit(). */ return ((*pru->pru_send)(so, flags, m, addr, control, td)); } } #endif #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif INP_HASH_WLOCK(pcbinfo); error = udp6_output(inp, m, addr, control, td); INP_HASH_WUNLOCK(pcbinfo); #ifdef INET #endif INP_WUNLOCK(inp); return (error); bad: INP_WUNLOCK(inp); m_freem(m); return (error); } struct pr_usrreqs udp6_usrreqs = { .pru_abort = udp6_abort, .pru_attach = udp6_attach, .pru_bind = udp6_bind, .pru_connect = udp6_connect, .pru_control = in6_control, .pru_detach = udp6_detach, .pru_disconnect = udp6_disconnect, .pru_peeraddr = in6_mapped_peeraddr, .pru_send = udp6_send, .pru_shutdown = udp_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp6_close }; Index: head/sys/ofed/drivers/net/mlx4/en_rx.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/en_rx.c (revision 275357) +++ head/sys/ofed/drivers/net/mlx4/en_rx.c (revision 275358) @@ -1,901 +1,901 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include "opt_inet.h" #include #include #include #include #include #include #include #ifdef CONFIG_NET_RX_BUSY_POLL #include #endif #include "mlx4_en.h" static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index; int possible_frags; int i; /* Set size and memtype fields */ for (i = 0; i < priv->num_frags; i++) { rx_desc->data[i].byte_count = cpu_to_be32(priv->frag_info[i].frag_size); rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key); } /* If the number of used fragments does not fill up the ring stride, * * remaining (unused) fragments must be padded with null address/size * * and a special memory key */ possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE; for (i = priv->num_frags; i < possible_frags; i++) { rx_desc->data[i].byte_count = 0; rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD); rx_desc->data[i].addr = 0; } } static int mlx4_en_alloc_buf(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, struct mbuf **mb_list, int i) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; struct mbuf *mb; dma_addr_t dma; if (i == 0) mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, frag_info->frag_size); else mb = m_getjcl(M_NOWAIT, MT_DATA, 0, frag_info->frag_size); if (mb == NULL) { priv->port_stats.rx_alloc_failed++; return -ENOMEM; } dma = pci_map_single(mdev->pdev, mb->m_data, frag_info->frag_size, PCI_DMA_FROMDEVICE); rx_desc->data[i].addr = cpu_to_be64(dma); mb_list[i] = mb; return 0; } static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride); struct mbuf **mb_list = ring->rx_info + (index << priv->log_rx_info); int i; for (i = 0; i < priv->num_frags; i++) if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, i)) goto err; return 0; err: while (i--) m_free(mb_list[i]); return -ENOMEM; } static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring) { *ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff); } static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_frag_info *frag_info; struct mlx4_en_dev *mdev = priv->mdev; struct mbuf **mb_list; struct mlx4_en_rx_desc *rx_desc = ring->buf + (index << ring->log_stride); dma_addr_t dma; int nr; mb_list = ring->rx_info + (index << priv->log_rx_info); for (nr = 0; nr < priv->num_frags; nr++) { en_dbg(DRV, priv, "Freeing fragment:%d\n", nr); frag_info = &priv->frag_info[nr]; dma = be64_to_cpu(rx_desc->data[nr].addr); #if BITS_PER_LONG == 64 en_dbg(DRV, priv, "Unmaping buffer at dma:0x%lx\n", (u64) dma); #elif BITS_PER_LONG == 32 en_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma); #endif pci_unmap_single(mdev->pdev, dma, frag_info->frag_size, PCI_DMA_FROMDEVICE); m_free(mb_list[nr]); } } static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) { struct mlx4_en_rx_ring *ring; int ring_ind; int buf_ind; int new_size; int err; for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) { for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = priv->rx_ring[ring_ind]; err = mlx4_en_prepare_rx_desc(priv, ring, ring->actual_size); if (err) { if (ring->actual_size == 0) { en_err(priv, "Failed to allocate " "enough rx buffers\n"); return -ENOMEM; } else { new_size = rounddown_pow_of_two(ring->actual_size); en_warn(priv, "Only %d buffers allocated " "reducing ring size to %d\n", ring->actual_size, new_size); goto reduce_rings; } } ring->actual_size++; ring->prod++; } } return 0; reduce_rings: for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = priv->rx_ring[ring_ind]; while (ring->actual_size > new_size) { ring->actual_size--; ring->prod--; mlx4_en_free_rx_desc(priv, ring, ring->actual_size); } } return 0; } static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring) { int index; en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n", ring->cons, ring->prod); /* Unmap and free Rx buffers */ BUG_ON((u32) (ring->prod - ring->cons) > ring->actual_size); while (ring->cons != ring->prod) { index = ring->cons & ring->size_mask; en_dbg(DRV, priv, "Processing descriptor:%d\n", index); mlx4_en_free_rx_desc(priv, ring, index); ++ring->cons; } } #if MLX4_EN_MAX_RX_FRAGS == 3 static int frag_sizes[] = { FRAG_SZ0, FRAG_SZ1, FRAG_SZ2, }; #elif MLX4_EN_MAX_RX_FRAGS == 2 static int frag_sizes[] = { FRAG_SZ0, FRAG_SZ1, }; #else #error "Unknown MAX_RX_FRAGS" #endif void mlx4_en_calc_rx_buf(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); int eff_mtu = dev->if_mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN; int buf_size = 0; int i, frag; for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) { /* * Allocate small to large but only as much as is needed for * the tail. */ while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1]) i--; priv->frag_info[frag].frag_size = frag_sizes[i]; priv->frag_info[frag].frag_prefix_size = buf_size; buf_size += priv->frag_info[frag].frag_size; } priv->num_frags = frag; priv->rx_mb_size = eff_mtu; priv->log_rx_info = ROUNDUP_LOG2(priv->num_frags * sizeof(struct mbuf *)); en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d " "num_frags:%d):\n", eff_mtu, priv->num_frags); for (i = 0; i < priv->num_frags; i++) { en_dbg(DRV, priv, " frag:%d - size:%d prefix:%d\n", i, priv->frag_info[i].frag_size, priv->frag_info[i].frag_prefix_size); } } int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring **pring, u32 size, int node) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rx_ring *ring; int err = -ENOMEM; int tmp; ring = kzalloc(sizeof(struct mlx4_en_rx_ring), GFP_KERNEL); if (!ring) { en_err(priv, "Failed to allocate RX ring structure\n"); return -ENOMEM; } ring->prod = 0; ring->cons = 0; ring->size = size; ring->size_mask = size - 1; ring->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + DS_SIZE * MLX4_EN_MAX_RX_FRAGS); ring->log_stride = ffs(ring->stride) - 1; ring->buf_size = ring->size * ring->stride + TXBB_SIZE; tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * sizeof(struct mbuf *)); ring->rx_info = kmalloc(tmp, GFP_KERNEL); if (!ring->rx_info) { err = -ENOMEM; goto err_ring; } en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n", ring->rx_info, tmp); err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); if (err) goto err_info; err = mlx4_en_map_buffer(&ring->wqres.buf); if (err) { en_err(priv, "Failed to map RX buffer\n"); goto err_hwq; } ring->buf = ring->wqres.buf.direct.buf; *pring = ring; return 0; err_hwq: mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); err_info: vfree(ring->rx_info); err_ring: kfree(ring); return err; } int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv) { struct mlx4_en_rx_ring *ring; int i; int ring_ind; int err; int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + DS_SIZE * priv->num_frags); for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = priv->rx_ring[ring_ind]; ring->prod = 0; ring->cons = 0; ring->actual_size = 0; ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn; ring->rx_alloc_order = priv->rx_alloc_order; ring->rx_alloc_size = priv->rx_alloc_size; ring->rx_buf_size = priv->rx_buf_size; ring->rx_mb_size = priv->rx_mb_size; ring->stride = stride; if (ring->stride <= TXBB_SIZE) ring->buf += TXBB_SIZE; ring->log_stride = ffs(ring->stride) - 1; ring->buf_size = ring->size * ring->stride; memset(ring->buf, 0, ring->buf_size); mlx4_en_update_rx_prod_db(ring); /* Initialize all descriptors */ for (i = 0; i < ring->size; i++) mlx4_en_init_rx_desc(priv, ring, i); #ifdef INET /* Configure lro mngr */ if (priv->dev->if_capenable & IFCAP_LRO) { if (tcp_lro_init(&ring->lro)) priv->dev->if_capenable &= ~IFCAP_LRO; else ring->lro.ifp = priv->dev; } #endif } err = mlx4_en_fill_rx_buffers(priv); if (err) goto err_buffers; for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = priv->rx_ring[ring_ind]; ring->size_mask = ring->actual_size - 1; mlx4_en_update_rx_prod_db(ring); } return 0; err_buffers: for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]); ring_ind = priv->rx_ring_num - 1; while (ring_ind >= 0) { ring = priv->rx_ring[ring_ind]; if (ring->stride <= TXBB_SIZE) ring->buf -= TXBB_SIZE; ring_ind--; } return err; } void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring **pring, u32 size, u16 stride) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rx_ring *ring = *pring; mlx4_en_unmap_buffer(&ring->wqres.buf); mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE); vfree(ring->rx_info); kfree(ring); *pring = NULL; #ifdef CONFIG_RFS_ACCEL mlx4_en_cleanup_filters(priv, ring); #endif } void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring) { #ifdef INET tcp_lro_free(&ring->lro); #endif mlx4_en_free_rx_buf(priv, ring); if (ring->stride <= TXBB_SIZE) ring->buf -= TXBB_SIZE; } static void validate_loopback(struct mlx4_en_priv *priv, struct mbuf *mb) { int i; int offset = ETHER_HDR_LEN; for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) { if (*(mb->m_data + offset) != (unsigned char) (i & 0xff)) goto out_loopback; } /* Loopback found */ priv->loopback_ok = 1; out_loopback: m_freem(mb); } static inline int invalid_cqe(struct mlx4_en_priv *priv, struct mlx4_cqe *cqe) { /* Drop packet on bad receive or bad checksum */ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR)) { en_err(priv, "CQE completed in error - vendor syndrom:%d syndrom:%d\n", ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome, ((struct mlx4_err_cqe *)cqe)->syndrome); return 1; } if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) { en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n"); return 1; } return 0; } /* Unmap a completed descriptor and free unused pages */ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, struct mbuf **mb_list, int length) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_frag_info *frag_info; dma_addr_t dma; struct mbuf *mb; int nr; mb = mb_list[0]; mb->m_pkthdr.len = length; /* Collect used fragments while replacing them in the HW descirptors */ for (nr = 0; nr < priv->num_frags; nr++) { frag_info = &priv->frag_info[nr]; if (length <= frag_info->frag_prefix_size) break; if (nr) mb->m_next = mb_list[nr]; mb = mb_list[nr]; mb->m_len = frag_info[nr].frag_size; dma = be64_to_cpu(rx_desc->data[nr].addr); /* Allocate a replacement page */ if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, nr)) goto fail; /* Unmap buffer */ pci_unmap_single(mdev->pdev, dma, frag_info[nr].frag_size, PCI_DMA_FROMDEVICE); } /* Adjust size of last fragment to match actual length */ mb->m_len = length - priv->frag_info[nr - 1].frag_prefix_size; mb->m_next = NULL; return 0; fail: /* Drop all accumulated fragments (which have already been replaced in * the descriptor) of this packet; remaining fragments are reused... */ while (nr > 0) { nr--; m_free(mb_list[nr]); } return -ENOMEM; } static struct mbuf *mlx4_en_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, struct mbuf **mb_list, unsigned int length) { struct mbuf *mb; mb = mb_list[0]; /* Move relevant fragments to mb */ if (unlikely(mlx4_en_complete_rx_desc(priv, rx_desc, mb_list, length))) return NULL; return mb; } /* For cpu arch with cache line of 64B the performance is better when cqe size==64B * To enlarge cqe size from 32B to 64B --> 32B of garbage (i.e. 0xccccccc) * was added in the beginning of each cqe (the real data is in the corresponding 32B). * The following calc ensures that when factor==1, it means we are alligned to 64B * and we get the real cqe data*/ #define CQE_FACTOR_INDEX(index, factor) ((index << factor) + factor) int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cqe *cqe; struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring]; struct mbuf **mb_list; struct mlx4_en_rx_desc *rx_desc; struct mbuf *mb; struct mlx4_cq *mcq = &cq->mcq; struct mlx4_cqe *buf = cq->buf; #ifdef INET struct lro_entry *queued; #endif int index; unsigned int length; int polled = 0; u32 cons_index = mcq->cons_index; u32 size_mask = ring->size_mask; int size = cq->size; int factor = priv->cqe_factor; if (!priv->port_up) return 0; /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx * descriptor offset can be deducted from the CQE index instead of * reading 'cqe->index' */ index = cons_index & size_mask; cqe = &buf[CQE_FACTOR_INDEX(index, factor)]; /* Process all completed CQEs */ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, cons_index & size)) { mb_list = ring->rx_info + (index << priv->log_rx_info); rx_desc = ring->buf + (index << ring->log_stride); /* * make sure we read the CQE after we read the ownership bit */ rmb(); if (invalid_cqe(priv, cqe)) { goto next; } /* * Packet is OK - process it. */ length = be32_to_cpu(cqe->byte_cnt); length -= ring->fcs_del; mb = mlx4_en_rx_mb(priv, rx_desc, mb_list, length); if (!mb) { ring->errors++; goto next; } ring->bytes += length; ring->packets++; if (unlikely(priv->validate_loopback)) { validate_loopback(priv, mb); goto next; } mb->m_pkthdr.flowid = cq->ring; - mb->m_flags |= M_FLOWID; + M_HASHTYPE_SET(mb, M_HASHTYPE_OPAQUE); mb->m_pkthdr.rcvif = dev; if (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_VLAN_PRESENT_MASK) { mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid); mb->m_flags |= M_VLANTAG; } if (likely(dev->if_capabilities & IFCAP_RXCSUM) && (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && (cqe->checksum == cpu_to_be16(0xffff))) { priv->port_stats.rx_chksum_good++; mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mb->m_pkthdr.csum_data = htons(0xffff); /* This packet is eligible for LRO if it is: * - DIX Ethernet (type interpretation) * - TCP/IP (v4) * - without IP options * - not an IP fragment */ #ifdef INET if (mlx4_en_can_lro(cqe->status) && (dev->if_capenable & IFCAP_LRO)) { if (ring->lro.lro_cnt != 0 && tcp_lro_rx(&ring->lro, mb, 0) == 0) goto next; } #endif /* LRO not possible, complete processing here */ INC_PERF_COUNTER(priv->pstats.lro_misses); } else { mb->m_pkthdr.csum_flags = 0; priv->port_stats.rx_chksum_none++; } /* Push it up the stack */ dev->if_input(dev, mb); next: ++cons_index; index = cons_index & size_mask; cqe = &buf[CQE_FACTOR_INDEX(index, factor)]; if (++polled == budget) goto out; } /* Flush all pending IP reassembly sessions */ out: #ifdef INET while ((queued = SLIST_FIRST(&ring->lro.lro_active)) != NULL) { SLIST_REMOVE_HEAD(&ring->lro.lro_active, next); tcp_lro_flush(&ring->lro, queued); } #endif AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); mcq->cons_index = cons_index; mlx4_cq_set_ci(mcq); wmb(); /* ensure HW sees CQ consumer before we post new buffers */ ring->cons = mcq->cons_index; ring->prod += polled; /* Polled descriptors were realocated in place */ mlx4_en_update_rx_prod_db(ring); return polled; } /* Rx CQ polling - called by NAPI */ static int mlx4_en_poll_rx_cq(struct mlx4_en_cq *cq, int budget) { struct net_device *dev = cq->dev; int done; done = mlx4_en_process_rx_cq(dev, cq, budget); cq->tot_rx += done; return done; } void mlx4_en_rx_irq(struct mlx4_cq *mcq) { struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); struct mlx4_en_priv *priv = netdev_priv(cq->dev); int done; // Shoot one within the irq context // Because there is no NAPI in freeBSD done = mlx4_en_poll_rx_cq(cq, MLX4_EN_RX_BUDGET); if (priv->port_up && (done == MLX4_EN_RX_BUDGET) ) { taskqueue_enqueue(cq->tq, &cq->cq_task); } else { mlx4_en_arm_cq(priv, cq); } } void mlx4_en_rx_que(void *context, int pending) { struct mlx4_en_cq *cq; cq = context; while (mlx4_en_poll_rx_cq(cq, MLX4_EN_RX_BUDGET) == MLX4_EN_RX_BUDGET); mlx4_en_arm_cq(cq->dev->if_softc, cq); } /* RSS related functions */ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, struct mlx4_en_rx_ring *ring, enum mlx4_qp_state *state, struct mlx4_qp *qp) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_qp_context *context; int err = 0; context = kmalloc(sizeof *context , GFP_KERNEL); if (!context) { en_err(priv, "Failed to allocate qp context\n"); return -ENOMEM; } err = mlx4_qp_alloc(mdev->dev, qpn, qp); if (err) { en_err(priv, "Failed to allocate qp #%x\n", qpn); goto out; } qp->event = mlx4_en_sqp_event; memset(context, 0, sizeof *context); mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0, qpn, ring->cqn, -1, context); context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma); /* Cancel FCS removal if FW allows */ if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) { context->param3 |= cpu_to_be32(1 << 29); ring->fcs_del = ETH_FCS_LEN; } else ring->fcs_del = 0; err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state); if (err) { mlx4_qp_remove(mdev->dev, qp); mlx4_qp_free(mdev->dev, qp); } mlx4_en_update_rx_prod_db(ring); out: kfree(context); return err; } int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) { int err; u32 qpn; err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, 0); if (err) { en_err(priv, "Failed reserving drop qpn\n"); return err; } err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp); if (err) { en_err(priv, "Failed allocating drop qp\n"); mlx4_qp_release_range(priv->mdev->dev, qpn, 1); return err; } return 0; } void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv) { u32 qpn; qpn = priv->drop_qp.qpn; mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp); mlx4_qp_free(priv->mdev->dev, &priv->drop_qp); mlx4_qp_release_range(priv->mdev->dev, qpn, 1); } /* Allocate rx qp's and configure them according to rss map */ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rss_map *rss_map = &priv->rss_map; struct mlx4_qp_context context; struct mlx4_rss_context *rss_context; int rss_rings; void *ptr; u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 | MLX4_RSS_TCP_IPV6); int i; int err = 0; int good_qps = 0; static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC, 0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD, 0x593D56D9, 0xF3253C06, 0x2ADC1FFC}; en_dbg(DRV, priv, "Configuring rss steering\n"); err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, priv->rx_ring_num, &rss_map->base_qpn, 0); if (err) { en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); return err; } for (i = 0; i < priv->rx_ring_num; i++) { priv->rx_ring[i]->qpn = rss_map->base_qpn + i; err = mlx4_en_config_rss_qp(priv, priv->rx_ring[i]->qpn, priv->rx_ring[i], &rss_map->state[i], &rss_map->qps[i]); if (err) goto rss_err; ++good_qps; } /* Configure RSS indirection qp */ err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp); if (err) { en_err(priv, "Failed to allocate RSS indirection QP\n"); goto rss_err; } rss_map->indir_qp.event = mlx4_en_sqp_event; mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn, priv->rx_ring[0]->cqn, -1, &context); if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num) rss_rings = priv->rx_ring_num; else rss_rings = priv->prof->rss_rings; ptr = ((void *) &context) + offsetof(struct mlx4_qp_context, pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; rss_context = ptr; rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 | (rss_map->base_qpn)); rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn); if (priv->mdev->profile.udp_rss) { rss_mask |= MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6; rss_context->base_qpn_udp = rss_context->default_qpn; } rss_context->flags = rss_mask; rss_context->hash_fn = MLX4_RSS_HASH_TOP; for (i = 0; i < 10; i++) rss_context->rss_key[i] = cpu_to_be32(rsskey[i]); err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context, &rss_map->indir_qp, &rss_map->indir_state); if (err) goto indir_err; return 0; indir_err: mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state, MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp); mlx4_qp_remove(mdev->dev, &rss_map->indir_qp); mlx4_qp_free(mdev->dev, &rss_map->indir_qp); rss_err: for (i = 0; i < good_qps; i++) { mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]); mlx4_qp_remove(mdev->dev, &rss_map->qps[i]); mlx4_qp_free(mdev->dev, &rss_map->qps[i]); } mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); return err; } void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rss_map *rss_map = &priv->rss_map; int i; mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state, MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp); mlx4_qp_remove(mdev->dev, &rss_map->indir_qp); mlx4_qp_free(mdev->dev, &rss_map->indir_qp); for (i = 0; i < priv->rx_ring_num; i++) { mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]); mlx4_qp_remove(mdev->dev, &rss_map->qps[i]); mlx4_qp_free(mdev->dev, &rss_map->qps[i]); } mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); } Index: head/sys/ofed/drivers/net/mlx4/en_tx.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/en_tx.c (revision 275357) +++ head/sys/ofed/drivers/net/mlx4/en_tx.c (revision 275358) @@ -1,1112 +1,1111 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_en.h" #include "utils.h" enum { MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ MAX_BF = 256, MIN_PKT_LEN = 17, }; static int inline_thold __read_mostly = MAX_INLINE; module_param_named(inline_thold, inline_thold, uint, 0444); MODULE_PARM_DESC(inline_thold, "threshold for using inline data"); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring, u32 size, u16 stride, int node, int queue_idx) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring; int tmp; int err; ring = kzalloc_node(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL, node); if (!ring) { ring = kzalloc(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL); if (!ring) { en_err(priv, "Failed allocating TX ring\n"); return -ENOMEM; } } ring->size = size; ring->size_mask = size - 1; ring->stride = stride; ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS; ring->inline_thold = min(inline_thold, MAX_INLINE); mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF); mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF); /* Allocate the buf ring */ ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF, M_WAITOK, &ring->tx_lock.m); if (ring->br == NULL) { en_err(priv, "Failed allocating tx_info ring\n"); return -ENOMEM; } tmp = size * sizeof(struct mlx4_en_tx_info); ring->tx_info = vmalloc_node(tmp, node); if (!ring->tx_info) { ring->tx_info = vmalloc(tmp); if (!ring->tx_info) { err = -ENOMEM; goto err_ring; } } en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n", ring->tx_info, tmp); ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node); if (!ring->bounce_buf) { ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL); if (!ring->bounce_buf) { err = -ENOMEM; goto err_info; } } ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE); /* Allocate HW buffers on provided NUMA node */ err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); if (err) { en_err(priv, "Failed allocating hwq resources\n"); goto err_bounce; } err = mlx4_en_map_buffer(&ring->wqres.buf); if (err) { en_err(priv, "Failed to map TX buffer\n"); goto err_hwq_res; } ring->buf = ring->wqres.buf.direct.buf; en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d " "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, MLX4_RESERVE_BF_QP); if (err) { en_err(priv, "failed reserving qp for TX ring\n"); goto err_map; } err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_reserve; } ring->qp.event = mlx4_en_sqp_event; err = mlx4_bf_alloc(mdev->dev, &ring->bf, node); if (err) { en_dbg(DRV, priv, "working without blueflame (%d)", err); ring->bf.uar = &mdev->priv_uar; ring->bf.uar->map = mdev->uar_map; ring->bf_enabled = false; } else ring->bf_enabled = true; ring->queue_index = queue_idx; if (queue_idx < priv->num_tx_rings_p_up ) CPU_SET(queue_idx, &ring->affinity_mask); *pring = ring; return 0; err_reserve: mlx4_qp_release_range(mdev->dev, ring->qpn, 1); err_map: mlx4_en_unmap_buffer(&ring->wqres.buf); err_hwq_res: mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); err_bounce: kfree(ring->bounce_buf); err_info: vfree(ring->tx_info); err_ring: buf_ring_free(ring->br, M_DEVBUF); kfree(ring); return err; } void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring = *pring; en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn); buf_ring_free(ring->br, M_DEVBUF); if (ring->bf_enabled) mlx4_bf_free(mdev->dev, &ring->bf); mlx4_qp_remove(mdev->dev, &ring->qp); mlx4_qp_free(mdev->dev, &ring->qp); mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1); mlx4_en_unmap_buffer(&ring->wqres.buf); mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); kfree(ring->bounce_buf); vfree(ring->tx_info); mtx_destroy(&ring->tx_lock.m); mtx_destroy(&ring->comp_lock.m); kfree(ring); *pring = NULL; } int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int cq, int user_prio) { struct mlx4_en_dev *mdev = priv->mdev; int err; ring->cqn = cq; ring->prod = 0; ring->cons = 0xffffffff; ring->last_nr_txbb = 1; ring->poll_cnt = 0; ring->blocked = 0; memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info)); memset(ring->buf, 0, ring->buf_size); ring->qp_state = MLX4_QP_STATE_RST; ring->doorbell_qpn = ring->qp.qpn << 8; mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, ring->cqn, user_prio, &ring->context); if (ring->bf_enabled) ring->context.usr_page = cpu_to_be32(ring->bf.uar->index); err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context, &ring->qp, &ring->qp_state); return err; } void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring) { struct mlx4_en_dev *mdev = priv->mdev; mlx4_qp_modify(mdev->dev, NULL, ring->qp_state, MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp); } static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int index, u8 owner) { struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE; void *end = ring->buf + ring->buf_size; __be32 *ptr = (__be32 *)tx_desc; __be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT)); int i; /* Optimize the common case when there are no wraparounds */ if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) /* Stamp the freed descriptor */ for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) { *ptr = stamp; ptr += STAMP_DWORDS; } else /* Stamp the freed descriptor */ for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) { *ptr = stamp; ptr += STAMP_DWORDS; if ((void *)ptr >= end) { ptr = ring->buf; stamp ^= cpu_to_be32(0x80000000); } } } static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int index, u8 owner, u64 timestamp) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE; struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset; struct mbuf *mb = tx_info->mb; void *end = ring->buf + ring->buf_size; int frags = tx_info->nr_segs;; int i; /* Optimize the common case when there are no wraparounds */ if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) { if (!tx_info->inl) { if (tx_info->linear) { dma_unmap_single(priv->ddev, (dma_addr_t) be64_to_cpu(data->addr), be32_to_cpu(data->byte_count), PCI_DMA_TODEVICE); ++data; } for (i = 0; i < frags; i++) { pci_unmap_single(mdev->pdev, (dma_addr_t) be64_to_cpu(data[i].addr), data[i].byte_count, PCI_DMA_TODEVICE); } } } else { if (!tx_info->inl) { if ((void *) data >= end) { data = ring->buf + ((void *)data - end); } if (tx_info->linear) { dma_unmap_single(priv->ddev, (dma_addr_t) be64_to_cpu(data->addr), be32_to_cpu(data->byte_count), PCI_DMA_TODEVICE); ++data; } for (i = 0; i < frags; i++) { /* Check for wraparound before unmapping */ if ((void *) data >= end) data = ring->buf; pci_unmap_single(mdev->pdev, (dma_addr_t) be64_to_cpu(data->addr), data->byte_count, PCI_DMA_TODEVICE); ++data; } } } /* Send a copy of the frame to the BPF listener */ if (priv->dev && priv->dev->if_bpf) ETHER_BPF_MTAP(priv->dev, mb); m_freem(mb); return tx_info->nr_txbb; } int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) { struct mlx4_en_priv *priv = netdev_priv(dev); int cnt = 0; /* Skip last polled descriptor */ ring->cons += ring->last_nr_txbb; en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n", ring->cons, ring->prod); if ((u32) (ring->prod - ring->cons) > ring->size) { en_warn(priv, "Tx consumer passed producer!\n"); return 0; } while (ring->cons != ring->prod) { ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring, ring->cons & ring->size_mask, !!(ring->cons & ring->size), 0); ring->cons += ring->last_nr_txbb; cnt++; } if (cnt) en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt); return cnt; } static int mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cq *mcq = &cq->mcq; struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; struct mlx4_cqe *cqe; u16 index; u16 new_index, ring_index, stamp_index; u32 txbbs_skipped = 0; u32 txbbs_stamp = 0; u32 cons_index = mcq->cons_index; int size = cq->size; u32 size_mask = ring->size_mask; struct mlx4_cqe *buf = cq->buf; u32 packets = 0; u32 bytes = 0; int factor = priv->cqe_factor; u64 timestamp = 0; int done = 0; if (!priv->port_up) return 0; index = cons_index & size_mask; cqe = &buf[(index << factor) + factor]; ring_index = ring->cons & size_mask; stamp_index = ring_index; /* Process all completed CQEs */ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, cons_index & size)) { /* * make sure we read the CQE after we read the * ownership bit */ rmb(); if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR)) { en_err(priv, "CQE completed in error - vendor syndrom: 0x%x syndrom: 0x%x\n", ((struct mlx4_err_cqe *)cqe)-> vendor_err_syndrome, ((struct mlx4_err_cqe *)cqe)->syndrome); } /* Skip over last polled CQE */ new_index = be16_to_cpu(cqe->wqe_index) & size_mask; do { txbbs_skipped += ring->last_nr_txbb; ring_index = (ring_index + ring->last_nr_txbb) & size_mask; /* free next descriptor */ ring->last_nr_txbb = mlx4_en_free_tx_desc( priv, ring, ring_index, !!((ring->cons + txbbs_skipped) & ring->size), timestamp); mlx4_en_stamp_wqe(priv, ring, stamp_index, !!((ring->cons + txbbs_stamp) & ring->size)); stamp_index = ring_index; txbbs_stamp = txbbs_skipped; packets++; bytes += ring->tx_info[ring_index].nr_bytes; } while (ring_index != new_index); ++cons_index; index = cons_index & size_mask; cqe = &buf[(index << factor) + factor]; } /* * To prevent CQ overflow we first update CQ consumer and only then * the ring consumer. */ mcq->cons_index = cons_index; mlx4_cq_set_ci(mcq); wmb(); ring->cons += txbbs_skipped; /* Wakeup Tx queue if it was stopped and ring is not full */ if (unlikely(ring->blocked) && (ring->prod - ring->cons) <= ring->full_size) { ring->blocked = 0; if (atomic_fetchadd_int(&priv->blocked, -1) == 1) atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE); ring->wake_queue++; priv->port_stats.wake_queue++; } return done; } void mlx4_en_tx_irq(struct mlx4_cq *mcq) { struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); struct mlx4_en_priv *priv = netdev_priv(cq->dev); struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; if (!spin_trylock(&ring->comp_lock)) return; mlx4_en_process_tx_cq(cq->dev, cq); mod_timer(&cq->timer, jiffies + 1); spin_unlock(&ring->comp_lock); } void mlx4_en_poll_tx_cq(unsigned long data) { struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data; struct mlx4_en_priv *priv = netdev_priv(cq->dev); struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; u32 inflight; INC_PERF_COUNTER(priv->pstats.tx_poll); if (!spin_trylock(&ring->comp_lock)) { mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); return; } mlx4_en_process_tx_cq(cq->dev, cq); inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb); /* If there are still packets in flight and the timer has not already * been scheduled by the Tx routine then schedule it here to guarantee * completion processing of these packets */ if (inflight && priv->port_up) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); spin_unlock(&ring->comp_lock); } static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, u32 index, unsigned int desc_size) { u32 copy = (ring->size - index) * TXBB_SIZE; int i; for (i = desc_size - copy - 4; i >= 0; i -= 4) { if ((i & (TXBB_SIZE - 1)) == 0) wmb(); *((u32 *) (ring->buf + i)) = *((u32 *) (ring->bounce_buf + copy + i)); } for (i = copy - 4; i >= 4 ; i -= 4) { if ((i & (TXBB_SIZE - 1)) == 0) wmb(); *((u32 *) (ring->buf + index * TXBB_SIZE + i)) = *((u32 *) (ring->bounce_buf + i)); } /* Return real descriptor location */ return ring->buf + index * TXBB_SIZE; } static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind) { struct mlx4_en_cq *cq = priv->tx_cq[tx_ind]; struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind]; /* If we don't have a pending timer, set one up to catch our recent post in case the interface becomes idle */ if (!timer_pending(&cq->timer)) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); /* Poll the CQ every mlx4_en_TX_MODER_POLL packets */ if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0) if (spin_trylock(&ring->comp_lock)) { mlx4_en_process_tx_cq(priv->dev, cq); spin_unlock(&ring->comp_lock); } } static int is_inline(struct mbuf *mb, int thold) { if (thold && mb->m_pkthdr.len <= thold && (mb->m_pkthdr.csum_flags & CSUM_TSO) == 0) return 1; return 0; } static int inline_size(struct mbuf *mb) { int len; len = mb->m_pkthdr.len; if (len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg) <= MLX4_INLINE_ALIGN) return ALIGN(len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg), 16); else return ALIGN(len + CTRL_SIZE + 2 * sizeof(struct mlx4_wqe_inline_seg), 16); } static int get_head_size(struct mbuf *mb) { struct ether_vlan_header *eh; struct tcphdr *th; struct ip *ip; int ip_hlen, tcp_hlen; struct ip6_hdr *ip6; uint16_t eth_type; int eth_hdr_len; eh = mtod(mb, struct ether_vlan_header *); if (mb->m_len < ETHER_HDR_LEN) return (0); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { eth_type = ntohs(eh->evl_proto); eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = ntohs(eh->evl_encap_proto); eth_hdr_len = ETHER_HDR_LEN; } if (mb->m_len < eth_hdr_len) return (0); switch (eth_type) { case ETHERTYPE_IP: ip = (struct ip *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip)) return (0); if (ip->ip_p != IPPROTO_TCP) return (0); ip_hlen = ip->ip_hl << 2; eth_hdr_len += ip_hlen; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip6)) return (0); if (ip6->ip6_nxt != IPPROTO_TCP) return (0); eth_hdr_len += sizeof(*ip6); break; default: return (0); } if (mb->m_len < eth_hdr_len + sizeof(*th)) return (0); th = (struct tcphdr *)(mb->m_data + eth_hdr_len); tcp_hlen = th->th_off << 2; eth_hdr_len += tcp_hlen; if (mb->m_len < eth_hdr_len) return (0); return (eth_hdr_len); } static int get_real_size(struct mbuf *mb, struct net_device *dev, int *p_n_segs, int *lso_header_size, int inl) { struct mbuf *m; int nr_segs = 0; for (m = mb; m != NULL; m = m->m_next) if (m->m_len) nr_segs++; if (mb->m_pkthdr.csum_flags & CSUM_TSO) { *lso_header_size = get_head_size(mb); if (*lso_header_size) { if (mb->m_len == *lso_header_size) nr_segs--; *p_n_segs = nr_segs; return CTRL_SIZE + nr_segs * DS_SIZE + ALIGN(*lso_header_size + 4, DS_SIZE); } } else *lso_header_size = 0; *p_n_segs = nr_segs; if (inl) return inline_size(mb); return (CTRL_SIZE + nr_segs * DS_SIZE); } static struct mbuf *mb_copy(struct mbuf *mb, int *offp, char *data, int len) { int bytes; int off; off = *offp; while (len) { bytes = min(mb->m_len - off, len); if (bytes) memcpy(data, mb->m_data + off, bytes); len -= bytes; data += bytes; off += bytes; if (off == mb->m_len) { off = 0; mb = mb->m_next; } } *offp = off; return (mb); } static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct mbuf *mb, int real_size, u16 *vlan_tag, int tx_ind) { struct mlx4_wqe_inline_seg *inl = &tx_desc->inl; int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl; int len; int off; off = 0; len = mb->m_pkthdr.len; if (len <= spc) { inl->byte_count = cpu_to_be32(1 << 31 | (max_t(typeof(len), len, MIN_PKT_LEN))); mb_copy(mb, &off, (void *)(inl + 1), len); if (len < MIN_PKT_LEN) memset(((void *)(inl + 1)) + len, 0, MIN_PKT_LEN - len); } else { inl->byte_count = cpu_to_be32(1 << 31 | spc); mb = mb_copy(mb, &off, (void *)(inl + 1), spc); inl = (void *) (inl + 1) + spc; mb_copy(mb, &off, (void *)(inl + 1), len - spc); wmb(); inl->byte_count = cpu_to_be32(1 << 31 | (len - spc)); } tx_desc->ctrl.vlan_tag = cpu_to_be16(*vlan_tag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!(*vlan_tag); tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f; } static unsigned long hashrandom; static void hashrandom_init(void *arg) { hashrandom = random(); } SYSINIT(hashrandom_init, SI_SUB_KLD, SI_ORDER_SECOND, &hashrandom_init, NULL); u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb) { struct mlx4_en_priv *priv = netdev_priv(dev); u32 rings_p_up = priv->num_tx_rings_p_up; u32 vlan_tag = 0; u32 up = 0; u32 queue_index; /* Obtain VLAN information if present */ if (mb->m_flags & M_VLANTAG) { vlan_tag = mb->m_pkthdr.ether_vtag; up = (vlan_tag >> 13); } - /* hash mbuf */ - queue_index = mlx4_en_hashmbuf(MLX4_F_HASHL3 | MLX4_F_HASHL4, mb, hashrandom); + /* check if flowid is set */ + if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) + queue_index = mb->m_pkthdr.flowid; + else + queue_index = mlx4_en_hashmbuf(MLX4_F_HASHL3 | MLX4_F_HASHL4, mb, hashrandom); return ((queue_index % rings_p_up) + (up * rings_p_up)); } static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt) { __iowrite64_copy(dst, src, bytecnt / 8); } static u64 mlx4_en_mac_to_u64(u8 *addr) { u64 mac = 0; int i; for (i = 0; i < ETHER_ADDR_LEN; i++) { mac <<= 8; mac |= addr[i]; } return mac; } static int mlx4_en_xmit(struct net_device *dev, int tx_ind, struct mbuf **mbp) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring; struct mlx4_en_cq *cq; struct mlx4_en_tx_desc *tx_desc; struct mlx4_wqe_data_seg *data; struct mlx4_en_tx_info *tx_info; struct mbuf *m; int nr_txbb; int nr_segs; int desc_size; int real_size; dma_addr_t dma; u32 index, bf_index, ring_size; __be32 op_own; u16 vlan_tag = 0; int i; int lso_header_size; bool bounce = false; bool inl = false; struct mbuf *mb; mb = *mbp; int defrag = 1; if (!priv->port_up) goto tx_drop; ring = priv->tx_ring[tx_ind]; ring_size = ring->size; inl = is_inline(mb, ring->inline_thold); retry: real_size = get_real_size(mb, dev, &nr_segs, &lso_header_size, inl); if (unlikely(!real_size)) goto tx_drop; /* Align descriptor to TXBB size */ desc_size = ALIGN(real_size, TXBB_SIZE); nr_txbb = desc_size / TXBB_SIZE; if (unlikely(nr_txbb > MAX_DESC_TXBBS)) { if (defrag) { mb = m_defrag(*mbp, M_NOWAIT); if (mb == NULL) { mb = *mbp; goto tx_drop; } *mbp = mb; defrag = 0; goto retry; } en_warn(priv, "Oversized header or SG list\n"); goto tx_drop; } /* Obtain VLAN information if present */ if (mb->m_flags & M_VLANTAG) { vlan_tag = mb->m_pkthdr.ether_vtag; } /* Check available TXBBs and 2K spare for prefetch * Even if netif_tx_stop_queue() will be called * driver will send current packet to ensure * that at least one completion will be issued after * stopping the queue */ if (unlikely((int)(ring->prod - ring->cons) > ring->full_size)) { /* every full Tx ring stops queue */ if (ring->blocked == 0) atomic_add_int(&priv->blocked, 1); /* Set HW-queue-is-full flag */ atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE); ring->blocked = 1; priv->port_stats.queue_stopped++; ring->queue_stopped++; /* Use interrupts to find out when queue opened */ cq = priv->tx_cq[tx_ind]; mlx4_en_arm_cq(priv, cq); return EBUSY; } /* Track current inflight packets for performance analysis */ AVG_PERF_COUNTER(priv->pstats.inflight_avg, (u32) (ring->prod - ring->cons - 1)); /* Packet is good - grab an index and transmit it */ index = ring->prod & ring->size_mask; bf_index = ring->prod; /* See if we have enough space for whole descriptor TXBB for setting * SW ownership on next descriptor; if not, use a bounce buffer. */ if (likely(index + nr_txbb <= ring_size)) tx_desc = ring->buf + index * TXBB_SIZE; else { tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf; bounce = true; } /* Save mb in tx_info ring */ tx_info = &ring->tx_info[index]; tx_info->mb = mb; tx_info->nr_txbb = nr_txbb; tx_info->nr_segs = nr_segs; if (lso_header_size) { memcpy(tx_desc->lso.header, mb->m_data, lso_header_size); data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4, DS_SIZE)); /* lso header is part of m_data. * need to omit when mapping DMA */ mb->m_data += lso_header_size; mb->m_len -= lso_header_size; } else data = &tx_desc->data; /* valid only for none inline segments */ tx_info->data_offset = (void *)data - (void *)tx_desc; if (inl) { tx_info->inl = 1; } else { for (i = 0, m = mb; i < nr_segs; i++, m = m->m_next) { if (m->m_len == 0) { i--; continue; } dma = pci_map_single(mdev->dev->pdev, m->m_data, m->m_len, PCI_DMA_TODEVICE); data->addr = cpu_to_be64(dma); data->lkey = cpu_to_be32(mdev->mr.key); wmb(); data->byte_count = cpu_to_be32(m->m_len); data++; } if (lso_header_size) { mb->m_data -= lso_header_size; mb->m_len += lso_header_size; } tx_info->inl = 0; } /* Prepare ctrl segement apart opcode+ownership, which depends on * whether LSO is used */ tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!vlan_tag; tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f; tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO | CSUM_TCP | CSUM_UDP | CSUM_TCP_IPV6 | CSUM_UDP_IPV6)) { if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM); if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM); priv->port_stats.tx_chksum_offload++; ring->tx_csum++; } if (unlikely(priv->validate_loopback)) { /* Copy dst mac address to wqe */ struct ether_header *ethh; u64 mac; u32 mac_l, mac_h; ethh = mtod(mb, struct ether_header *); mac = mlx4_en_mac_to_u64(ethh->ether_dhost); if (mac) { mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16); mac_l = (u32) (mac & 0xffffffff); tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h); tx_desc->ctrl.imm = cpu_to_be32(mac_l); } } /* Handle LSO (TSO) packets */ if (lso_header_size) { int segsz; /* Mark opcode as LSO */ op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) | ((ring->prod & ring_size) ? cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); /* Fill in the LSO prefix */ tx_desc->lso.mss_hdr_size = cpu_to_be32( mb->m_pkthdr.tso_segsz << 16 | lso_header_size); priv->port_stats.tso_packets++; segsz = mb->m_pkthdr.tso_segsz; i = ((mb->m_pkthdr.len - lso_header_size + segsz - 1) / segsz); tx_info->nr_bytes= mb->m_pkthdr.len + (i - 1) * lso_header_size; ring->packets += i; } else { /* Normal (Non LSO) packet */ op_own = cpu_to_be32(MLX4_OPCODE_SEND) | ((ring->prod & ring_size) ? cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); tx_info->nr_bytes = max(mb->m_pkthdr.len, (unsigned int)ETHER_MIN_LEN - ETHER_CRC_LEN); ring->packets++; } ring->bytes += tx_info->nr_bytes; AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len); if (tx_info->inl) { build_inline_wqe(tx_desc, mb, real_size, &vlan_tag, tx_ind); tx_info->inl = 1; } ring->prod += nr_txbb; /* If we used a bounce buffer then copy descriptor back into place */ if (unlikely(bounce)) tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size); if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && !vlan_tag) { *(__be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn); op_own |= htonl((bf_index & 0xffff) << 8); /* Ensure new descirptor hits memory * before setting ownership of this descriptor to HW */ wmb(); tx_desc->ctrl.owner_opcode = op_own; wmb(); mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl, desc_size); wmb(); ring->bf.offset ^= ring->bf.buf_size; } else { /* Ensure new descirptor hits memory * before setting ownership of this descriptor to HW */ wmb(); tx_desc->ctrl.owner_opcode = op_own; wmb(); writel(cpu_to_be32(ring->doorbell_qpn), ring->bf.uar->map + MLX4_SEND_DOORBELL); } return 0; tx_drop: *mbp = NULL; m_freem(mb); return EINVAL; } static int mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mbuf *next; int enqueued, err = 0; ring = priv->tx_ring[tx_ind]; if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || priv->port_up == 0) { if (m != NULL) err = drbr_enqueue(dev, ring->br, m); return (err); } enqueued = 0; if (m != NULL) { if ((err = drbr_enqueue(dev, ring->br, m)) != 0) return (err); } /* Process the queue */ while ((next = drbr_peek(dev, ring->br)) != NULL) { if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) { if (next == NULL) { drbr_advance(dev, ring->br); } else { drbr_putback(dev, ring->br, next); } break; } drbr_advance(dev, ring->br); enqueued++; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (enqueued > 0) ring->watchdog_time = ticks; return (err); } void mlx4_en_tx_que(void *context, int pending) { struct mlx4_en_tx_ring *ring; struct mlx4_en_priv *priv; struct net_device *dev; struct mlx4_en_cq *cq; int tx_ind; cq = context; dev = cq->dev; priv = dev->if_softc; tx_ind = cq->ring; ring = priv->tx_ring[tx_ind]; if (dev->if_drv_flags & IFF_DRV_RUNNING) { mlx4_en_xmit_poll(priv, tx_ind); spin_lock(&ring->tx_lock); if (!drbr_empty(dev, ring->br)) mlx4_en_transmit_locked(dev, tx_ind, NULL); spin_unlock(&ring->tx_lock); } } int mlx4_en_transmit(struct ifnet *dev, struct mbuf *m) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mlx4_en_cq *cq; - int i = 0, err = 0; + int i, err = 0; - /* Which queue to use */ - if ((m->m_flags & (M_FLOWID | M_VLANTAG)) == M_FLOWID) { - i = m->m_pkthdr.flowid % (priv->tx_ring_num - 1); - } - else { - i = mlx4_en_select_queue(dev, m); - } + /* Compute which queue to use */ + i = mlx4_en_select_queue(dev, m); + ring = priv->tx_ring[i]; if (spin_trylock(&ring->tx_lock)) { err = mlx4_en_transmit_locked(dev, i, m); spin_unlock(&ring->tx_lock); /* Poll CQ here */ mlx4_en_xmit_poll(priv, i); } else { err = drbr_enqueue(dev, ring->br, m); cq = priv->tx_cq[i]; taskqueue_enqueue(cq->tq, &cq->cq_task); } return (err); } /* * Flush ring buffers. */ void mlx4_en_qflush(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mbuf *m; for (int i = 0; i < priv->tx_ring_num; i++) { ring = priv->tx_ring[i]; spin_lock(&ring->tx_lock); while ((m = buf_ring_dequeue_sc(ring->br)) != NULL) m_freem(m); spin_unlock(&ring->tx_lock); } if_qflush(dev); } Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 275357) +++ head/sys/sys/param.h (revision 275358) @@ -1,349 +1,349 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/book.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1100047 /* Master, propagated to newvers */ +#define __FreeBSD_version 1100048 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #ifdef _KERNEL #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_MAP_RENAME 1100039 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #define BKVASIZE 16384 /* must be power of 2 */ #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef lint #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* lint */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */