Index: head/sys/dev/ixl/if_iavf.c =================================================================== --- head/sys/dev/ixl/if_iavf.c (revision 357663) +++ head/sys/dev/ixl/if_iavf.c (revision 357664) @@ -1,2424 +1,2424 @@ /****************************************************************************** Copyright (c) 2013-2018, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #include "iavf.h" /********************************************************************* * Driver version *********************************************************************/ #define IAVF_DRIVER_VERSION_MAJOR 2 #define IAVF_DRIVER_VERSION_MINOR 0 #define IAVF_DRIVER_VERSION_BUILD 0 #define IAVF_DRIVER_VERSION_STRING \ __XSTRING(IAVF_DRIVER_VERSION_MAJOR) "." \ __XSTRING(IAVF_DRIVER_VERSION_MINOR) "." 
\ __XSTRING(IAVF_DRIVER_VERSION_BUILD) "-k" /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * * ( Vendor ID, Device ID, Branding String ) *********************************************************************/ static pci_vendor_info_t iavf_vendor_info_array[] = { PVID(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_VF, "Intel(R) Ethernet Virtual Function 700 Series"), PVID(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_X722_VF, "Intel(R) Ethernet Virtual Function 700 Series (X722)"), PVID(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_ADAPTIVE_VF, "Intel(R) Ethernet Adaptive Virtual Function"), /* required last entry */ PVID_END }; /********************************************************************* * Function prototypes *********************************************************************/ static void *iavf_register(device_t dev); static int iavf_if_attach_pre(if_ctx_t ctx); static int iavf_if_attach_post(if_ctx_t ctx); static int iavf_if_detach(if_ctx_t ctx); static int iavf_if_shutdown(if_ctx_t ctx); static int iavf_if_suspend(if_ctx_t ctx); static int iavf_if_resume(if_ctx_t ctx); static int iavf_if_msix_intr_assign(if_ctx_t ctx, int msix); static void iavf_if_enable_intr(if_ctx_t ctx); static void iavf_if_disable_intr(if_ctx_t ctx); static int iavf_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid); static int iavf_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid); static int iavf_if_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int ntxqs, int ntxqsets); static int iavf_if_rx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int nqs, int nqsets); static void iavf_if_queues_free(if_ctx_t ctx); static void iavf_if_update_admin_status(if_ctx_t ctx); static void iavf_if_multi_set(if_ctx_t ctx); static int iavf_if_mtu_set(if_ctx_t ctx, uint32_t mtu); static void iavf_if_media_status(if_ctx_t ctx, struct ifmediareq *ifmr); static int iavf_if_media_change(if_ctx_t ctx); static int iavf_if_promisc_set(if_ctx_t ctx, int flags); static void iavf_if_timer(if_ctx_t ctx, uint16_t qid); static void iavf_if_vlan_register(if_ctx_t ctx, u16 vtag); static void iavf_if_vlan_unregister(if_ctx_t ctx, u16 vtag); static uint64_t iavf_if_get_counter(if_ctx_t ctx, ift_counter cnt); static void iavf_if_stop(if_ctx_t ctx); static int iavf_allocate_pci_resources(struct iavf_sc *); static int iavf_reset_complete(struct i40e_hw *); static int iavf_setup_vc(struct iavf_sc *); static int iavf_reset(struct iavf_sc *); static int iavf_vf_config(struct iavf_sc *); static void iavf_init_filters(struct iavf_sc *); static void iavf_free_pci_resources(struct iavf_sc *); static void iavf_free_filters(struct iavf_sc *); static void iavf_setup_interface(device_t, struct iavf_sc *); static void iavf_add_device_sysctls(struct iavf_sc *); static void iavf_enable_adminq_irq(struct i40e_hw *); static void iavf_disable_adminq_irq(struct i40e_hw *); static void iavf_enable_queue_irq(struct i40e_hw *, int); static void iavf_disable_queue_irq(struct i40e_hw *, int); static void iavf_config_rss(struct iavf_sc *); static void iavf_stop(struct iavf_sc *); static int iavf_add_mac_filter(struct iavf_sc *, u8 *, u16); static int iavf_del_mac_filter(struct iavf_sc *sc, u8 *macaddr); static int iavf_msix_que(void *); static int iavf_msix_adminq(void *); //static void iavf_del_multi(struct iavf_sc *sc); static void iavf_init_multi(struct iavf_sc *sc); static void iavf_configure_itr(struct iavf_sc *sc); static int iavf_sysctl_rx_itr(SYSCTL_HANDLER_ARGS); 
static int iavf_sysctl_tx_itr(SYSCTL_HANDLER_ARGS); static int iavf_sysctl_current_speed(SYSCTL_HANDLER_ARGS); static int iavf_sysctl_sw_filter_list(SYSCTL_HANDLER_ARGS); static int iavf_sysctl_queue_interrupt_table(SYSCTL_HANDLER_ARGS); static int iavf_sysctl_vf_reset(SYSCTL_HANDLER_ARGS); static int iavf_sysctl_vflr_reset(SYSCTL_HANDLER_ARGS); static void iavf_save_tunables(struct iavf_sc *); static enum i40e_status_code iavf_process_adminq(struct iavf_sc *, u16 *); static int iavf_send_vc_msg(struct iavf_sc *sc, u32 op); static int iavf_send_vc_msg_sleep(struct iavf_sc *sc, u32 op); /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t iavf_methods[] = { /* Device interface */ DEVMETHOD(device_register, iavf_register), DEVMETHOD(device_probe, iflib_device_probe), DEVMETHOD(device_attach, iflib_device_attach), DEVMETHOD(device_detach, iflib_device_detach), DEVMETHOD(device_shutdown, iflib_device_shutdown), DEVMETHOD_END }; static driver_t iavf_driver = { "iavf", iavf_methods, sizeof(struct iavf_sc), }; devclass_t iavf_devclass; DRIVER_MODULE(iavf, pci, iavf_driver, iavf_devclass, 0, 0); MODULE_PNP_INFO("U32:vendor;U32:device;U32:subvendor;U32:subdevice;U32:revision", pci, iavf, iavf_vendor_info_array, nitems(iavf_vendor_info_array) - 1); MODULE_VERSION(iavf, 1); MODULE_DEPEND(iavf, pci, 1, 1, 1); MODULE_DEPEND(iavf, ether, 1, 1, 1); MODULE_DEPEND(iavf, iflib, 1, 1, 1); MALLOC_DEFINE(M_IAVF, "iavf", "iavf driver allocations"); static device_method_t iavf_if_methods[] = { DEVMETHOD(ifdi_attach_pre, iavf_if_attach_pre), DEVMETHOD(ifdi_attach_post, iavf_if_attach_post), DEVMETHOD(ifdi_detach, iavf_if_detach), DEVMETHOD(ifdi_shutdown, iavf_if_shutdown), DEVMETHOD(ifdi_suspend, iavf_if_suspend), DEVMETHOD(ifdi_resume, iavf_if_resume), DEVMETHOD(ifdi_init, iavf_if_init), DEVMETHOD(ifdi_stop, iavf_if_stop), DEVMETHOD(ifdi_msix_intr_assign, iavf_if_msix_intr_assign), DEVMETHOD(ifdi_intr_enable, iavf_if_enable_intr), DEVMETHOD(ifdi_intr_disable, iavf_if_disable_intr), DEVMETHOD(ifdi_rx_queue_intr_enable, iavf_if_rx_queue_intr_enable), DEVMETHOD(ifdi_tx_queue_intr_enable, iavf_if_tx_queue_intr_enable), DEVMETHOD(ifdi_tx_queues_alloc, iavf_if_tx_queues_alloc), DEVMETHOD(ifdi_rx_queues_alloc, iavf_if_rx_queues_alloc), DEVMETHOD(ifdi_queues_free, iavf_if_queues_free), DEVMETHOD(ifdi_update_admin_status, iavf_if_update_admin_status), DEVMETHOD(ifdi_multi_set, iavf_if_multi_set), DEVMETHOD(ifdi_mtu_set, iavf_if_mtu_set), DEVMETHOD(ifdi_media_status, iavf_if_media_status), DEVMETHOD(ifdi_media_change, iavf_if_media_change), DEVMETHOD(ifdi_promisc_set, iavf_if_promisc_set), DEVMETHOD(ifdi_timer, iavf_if_timer), DEVMETHOD(ifdi_vlan_register, iavf_if_vlan_register), DEVMETHOD(ifdi_vlan_unregister, iavf_if_vlan_unregister), DEVMETHOD(ifdi_get_counter, iavf_if_get_counter), DEVMETHOD_END }; static driver_t iavf_if_driver = { "iavf_if", iavf_if_methods, sizeof(struct iavf_sc) }; /* ** TUNEABLE PARAMETERS: */ static SYSCTL_NODE(_hw, OID_AUTO, iavf, CTLFLAG_RD, 0, "iavf driver parameters"); /* * Different method for processing TX descriptor * completion. 
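 * Illustrative note (not part of this change): since the tunables below
 * are CTLFLAG_RDTUN, they can only be set before the driver loads, e.g.
 * in /boot/loader.conf: hw.iavf.enable_head_writeback="1"; the sysctl of
 * the same name then reports the active value read-only at runtime.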
*/ static int iavf_enable_head_writeback = 0; TUNABLE_INT("hw.iavf.enable_head_writeback", &iavf_enable_head_writeback); SYSCTL_INT(_hw_iavf, OID_AUTO, enable_head_writeback, CTLFLAG_RDTUN, &iavf_enable_head_writeback, 0, "For detecting last completed TX descriptor by hardware, use value written by HW instead of checking descriptors"); static int iavf_core_debug_mask = 0; TUNABLE_INT("hw.iavf.core_debug_mask", &iavf_core_debug_mask); SYSCTL_INT(_hw_iavf, OID_AUTO, core_debug_mask, CTLFLAG_RDTUN, &iavf_core_debug_mask, 0, "Display debug statements that are printed in non-shared code"); static int iavf_shared_debug_mask = 0; TUNABLE_INT("hw.iavf.shared_debug_mask", &iavf_shared_debug_mask); SYSCTL_INT(_hw_iavf, OID_AUTO, shared_debug_mask, CTLFLAG_RDTUN, &iavf_shared_debug_mask, 0, "Display debug statements that are printed in shared code"); int iavf_rx_itr = IXL_ITR_8K; TUNABLE_INT("hw.iavf.rx_itr", &iavf_rx_itr); SYSCTL_INT(_hw_iavf, OID_AUTO, rx_itr, CTLFLAG_RDTUN, &iavf_rx_itr, 0, "RX Interrupt Rate"); int iavf_tx_itr = IXL_ITR_4K; TUNABLE_INT("hw.iavf.tx_itr", &iavf_tx_itr); SYSCTL_INT(_hw_iavf, OID_AUTO, tx_itr, CTLFLAG_RDTUN, &iavf_tx_itr, 0, "TX Interrupt Rate"); extern struct if_txrx ixl_txrx_hwb; extern struct if_txrx ixl_txrx_dwb; static struct if_shared_ctx iavf_sctx_init = { .isc_magic = IFLIB_MAGIC, .isc_q_align = PAGE_SIZE,/* max(DBA_ALIGN, PAGE_SIZE) */ .isc_tx_maxsize = IXL_TSO_SIZE + sizeof(struct ether_vlan_header), .isc_tx_maxsegsize = IXL_MAX_DMA_SEG_SIZE, .isc_tso_maxsize = IXL_TSO_SIZE + sizeof(struct ether_vlan_header), .isc_tso_maxsegsize = IXL_MAX_DMA_SEG_SIZE, .isc_rx_maxsize = 16384, .isc_rx_nsegments = IXL_MAX_RX_SEGS, .isc_rx_maxsegsize = IXL_MAX_DMA_SEG_SIZE, .isc_nfl = 1, .isc_ntxqs = 1, .isc_nrxqs = 1, .isc_admin_intrcnt = 1, .isc_vendor_info = iavf_vendor_info_array, .isc_driver_version = IAVF_DRIVER_VERSION_STRING, .isc_driver = &iavf_if_driver, .isc_flags = IFLIB_NEED_SCRATCH | IFLIB_NEED_ZERO_CSUM | IFLIB_TSO_INIT_IP | IFLIB_IS_VF, .isc_nrxd_min = {IXL_MIN_RING}, .isc_ntxd_min = {IXL_MIN_RING}, .isc_nrxd_max = {IXL_MAX_RING}, .isc_ntxd_max = {IXL_MAX_RING}, .isc_nrxd_default = {IXL_DEFAULT_RING}, .isc_ntxd_default = {IXL_DEFAULT_RING}, }; if_shared_ctx_t iavf_sctx = &iavf_sctx_init; /*** Functions ***/ static void * iavf_register(device_t dev) { return (iavf_sctx); } static int iavf_allocate_pci_resources(struct iavf_sc *sc) { struct i40e_hw *hw = &sc->hw; device_t dev = iflib_get_dev(sc->vsi.ctx); int rid; /* Map BAR0 */ rid = PCIR_BAR(0); sc->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!(sc->pci_mem)) { device_printf(dev, "Unable to allocate bus resource: PCI memory\n"); return (ENXIO); } /* Save off the PCI information */ hw->vendor_id = pci_get_vendor(dev); hw->device_id = pci_get_device(dev); hw->revision_id = pci_read_config(dev, PCIR_REVID, 1); hw->subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); hw->subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); hw->bus.device = pci_get_slot(dev); hw->bus.func = pci_get_function(dev); /* Save off register access information */ sc->osdep.mem_bus_space_tag = rman_get_bustag(sc->pci_mem); sc->osdep.mem_bus_space_handle = rman_get_bushandle(sc->pci_mem); sc->osdep.mem_bus_space_size = rman_get_size(sc->pci_mem); sc->osdep.flush_reg = I40E_VFGEN_RSTAT; sc->osdep.dev = dev; sc->hw.hw_addr = (u8 *) &sc->osdep.mem_bus_space_handle; sc->hw.back = &sc->osdep; return (0); } static int iavf_if_attach_pre(if_ctx_t ctx) { device_t dev; struct iavf_sc *sc; struct 
i40e_hw *hw; struct ixl_vsi *vsi; if_softc_ctx_t scctx; int error = 0; dev = iflib_get_dev(ctx); sc = iflib_get_softc(ctx); vsi = &sc->vsi; vsi->back = sc; sc->dev = dev; hw = &sc->hw; vsi->dev = dev; vsi->hw = &sc->hw; vsi->num_vlans = 0; vsi->ctx = ctx; vsi->media = iflib_get_media(ctx); vsi->shared = scctx = iflib_get_softc_ctx(ctx); iavf_save_tunables(sc); /* Do PCI setup - map BAR0, etc */ if (iavf_allocate_pci_resources(sc)) { device_printf(dev, "%s: Allocation of PCI resources failed\n", __func__); error = ENXIO; goto err_early; } iavf_dbg_init(sc, "Allocated PCI resources and MSI-X vectors\n"); /* * XXX: This is called by init_shared_code in the PF driver, * but the rest of that function does not support VFs. */ error = i40e_set_mac_type(hw); if (error) { device_printf(dev, "%s: set_mac_type failed: %d\n", __func__, error); goto err_pci_res; } error = iavf_reset_complete(hw); if (error) { device_printf(dev, "%s: Device is still being reset\n", __func__); goto err_pci_res; } iavf_dbg_init(sc, "VF Device is ready for configuration\n"); /* Sets up Admin Queue */ error = iavf_setup_vc(sc); if (error) { device_printf(dev, "%s: Error setting up PF comms, %d\n", __func__, error); goto err_pci_res; } iavf_dbg_init(sc, "PF API version verified\n"); /* Need API version before sending reset message */ error = iavf_reset(sc); if (error) { device_printf(dev, "VF reset failed; reload the driver\n"); goto err_aq; } iavf_dbg_init(sc, "VF reset complete\n"); /* Ask for VF config from PF */ error = iavf_vf_config(sc); if (error) { device_printf(dev, "Error getting configuration from PF: %d\n", error); goto err_aq; } device_printf(dev, "VSIs %d, QPs %d, MSI-X %d, RSS sizes: key %d lut %d\n", sc->vf_res->num_vsis, sc->vf_res->num_queue_pairs, sc->vf_res->max_vectors, sc->vf_res->rss_key_size, sc->vf_res->rss_lut_size); iavf_dbg_info(sc, "Capabilities=%b\n", sc->vf_res->vf_cap_flags, IAVF_PRINTF_VF_OFFLOAD_FLAGS); /* got VF config message back from PF, now we can parse it */ for (int i = 0; i < sc->vf_res->num_vsis; i++) { /* XXX: We only use the first VSI we find */ if (sc->vf_res->vsi_res[i].vsi_type == I40E_VSI_SRIOV) sc->vsi_res = &sc->vf_res->vsi_res[i]; } if (!sc->vsi_res) { device_printf(dev, "%s: no LAN VSI found\n", __func__); error = EIO; goto err_res_buf; } vsi->id = sc->vsi_res->vsi_id; iavf_dbg_init(sc, "Resource Acquisition complete\n"); /* If no mac address was assigned just make a random one */ if (!iavf_check_ether_addr(hw->mac.addr)) { u8 addr[ETHER_ADDR_LEN]; arc4rand(&addr, sizeof(addr), 0); addr[0] &= 0xFE; addr[0] |= 0x02; bcopy(addr, hw->mac.addr, sizeof(addr)); } bcopy(hw->mac.addr, hw->mac.perm_addr, ETHER_ADDR_LEN); iflib_set_mac(ctx, hw->mac.addr); /* Allocate filter lists */ iavf_init_filters(sc); /* Fill out more iflib parameters */ scctx->isc_ntxqsets_max = scctx->isc_nrxqsets_max = sc->vsi_res->num_queue_pairs; if (vsi->enable_head_writeback) { scctx->isc_txqsizes[0] = roundup2(scctx->isc_ntxd[0] * sizeof(struct i40e_tx_desc) + sizeof(u32), DBA_ALIGN); scctx->isc_txrx = &ixl_txrx_hwb; } else { scctx->isc_txqsizes[0] = roundup2(scctx->isc_ntxd[0] * sizeof(struct i40e_tx_desc), DBA_ALIGN); scctx->isc_txrx = &ixl_txrx_dwb; } scctx->isc_rxqsizes[0] = roundup2(scctx->isc_nrxd[0] * sizeof(union i40e_32byte_rx_desc), DBA_ALIGN); scctx->isc_msix_bar = PCIR_BAR(IXL_MSIX_BAR); scctx->isc_tx_nsegments = IXL_MAX_TX_SEGS; scctx->isc_tx_tso_segments_max = IXL_MAX_TSO_SEGS; scctx->isc_tx_tso_size_max = IXL_TSO_SIZE; scctx->isc_tx_tso_segsize_max = IXL_MAX_DMA_SEG_SIZE; 
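/*
 * Sizing note (illustrative, not part of this change): in head-writeback
 * mode the roundup2() above reserves an extra sizeof(u32) at the end of
 * the TX descriptor ring for the head index that the hardware writes
 * back, which is also why that branch selects the ixl_txrx_hwb ops
 * rather than ixl_txrx_dwb.
 */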
scctx->isc_rss_table_size = IXL_RSS_VSI_LUT_SIZE; scctx->isc_tx_csum_flags = CSUM_OFFLOAD; scctx->isc_capabilities = scctx->isc_capenable = IXL_CAPS; return (0); err_res_buf: free(sc->vf_res, M_IAVF); err_aq: i40e_shutdown_adminq(hw); err_pci_res: iavf_free_pci_resources(sc); err_early: return (error); } static int iavf_if_attach_post(if_ctx_t ctx) { device_t dev; struct iavf_sc *sc; struct i40e_hw *hw; struct ixl_vsi *vsi; int error = 0; INIT_DBG_DEV(dev, "begin"); dev = iflib_get_dev(ctx); sc = iflib_get_softc(ctx); vsi = &sc->vsi; vsi->ifp = iflib_get_ifp(ctx); hw = &sc->hw; /* Save off determined number of queues for interface */ vsi->num_rx_queues = vsi->shared->isc_nrxqsets; vsi->num_tx_queues = vsi->shared->isc_ntxqsets; /* Setup the stack interface */ iavf_setup_interface(dev, sc); INIT_DBG_DEV(dev, "Interface setup complete"); /* Initialize statistics & add sysctls */ bzero(&sc->vsi.eth_stats, sizeof(struct i40e_eth_stats)); iavf_add_device_sysctls(sc); sc->init_state = IAVF_INIT_READY; atomic_store_rel_32(&sc->queues_enabled, 0); /* We want AQ enabled early for init */ iavf_enable_adminq_irq(hw); INIT_DBG_DEV(dev, "end"); return (error); } /** * XXX: iflib always ignores the return value of detach() * -> This means that this isn't allowed to fail */ static int iavf_if_detach(if_ctx_t ctx) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; struct i40e_hw *hw = &sc->hw; device_t dev = sc->dev; enum i40e_status_code status; INIT_DBG_DEV(dev, "begin"); /* Remove all the media and link information */ ifmedia_removeall(vsi->media); iavf_disable_adminq_irq(hw); status = i40e_shutdown_adminq(&sc->hw); if (status != I40E_SUCCESS) { device_printf(dev, "i40e_shutdown_adminq() failed with status %s\n", i40e_stat_str(hw, status)); } free(sc->vf_res, M_IAVF); iavf_free_pci_resources(sc); iavf_free_filters(sc); INIT_DBG_DEV(dev, "end"); return (0); } static int iavf_if_shutdown(if_ctx_t ctx) { return (0); } static int iavf_if_suspend(if_ctx_t ctx) { return (0); } static int iavf_if_resume(if_ctx_t ctx) { return (0); } static int iavf_send_vc_msg_sleep(struct iavf_sc *sc, u32 op) { int error = 0; if_ctx_t ctx = sc->vsi.ctx; error = ixl_vc_send_cmd(sc, op); if (error != 0) { iavf_dbg_vc(sc, "Error sending %b: %d\n", op, IAVF_FLAGS, error); return (error); } /* Don't wait for a response if the device is being detached. 
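 * While attached, the caller holds the iflib ctx lock; sx_sleep() below
 * atomically drops it until the virtchnl completion wakes the op channel
 * or IAVF_AQ_TIMEOUT expires, in which case EWOULDBLOCK is reported as a
 * timed-out op.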
*/ if (!iflib_in_detach(ctx)) { iavf_dbg_vc(sc, "Sleeping for op %b\n", op, IAVF_FLAGS); error = sx_sleep(ixl_vc_get_op_chan(sc, op), iflib_ctx_lock_get(ctx), PRI_MAX, "iavf_vc", IAVF_AQ_TIMEOUT); if (error == EWOULDBLOCK) device_printf(sc->dev, "%b timed out\n", op, IAVF_FLAGS); } return (error); } static int iavf_send_vc_msg(struct iavf_sc *sc, u32 op) { int error = 0; error = ixl_vc_send_cmd(sc, op); if (error != 0) iavf_dbg_vc(sc, "Error sending %b: %d\n", op, IAVF_FLAGS, error); return (error); } static void iavf_init_queues(struct ixl_vsi *vsi) { struct ixl_tx_queue *tx_que = vsi->tx_queues; struct ixl_rx_queue *rx_que = vsi->rx_queues; struct rx_ring *rxr; for (int i = 0; i < vsi->num_tx_queues; i++, tx_que++) ixl_init_tx_ring(vsi, tx_que); for (int i = 0; i < vsi->num_rx_queues; i++, rx_que++) { rxr = &rx_que->rxr; rxr->mbuf_sz = iflib_get_rx_mbuf_sz(vsi->ctx); wr32(vsi->hw, rxr->tail, 0); } } void iavf_if_init(if_ctx_t ctx) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; struct i40e_hw *hw = &sc->hw; struct ifnet *ifp = iflib_get_ifp(ctx); u8 tmpaddr[ETHER_ADDR_LEN]; int error = 0; INIT_DBG_IF(ifp, "begin"); MPASS(sx_xlocked(iflib_ctx_lock_get(ctx))); error = iavf_reset_complete(hw); if (error) { device_printf(sc->dev, "%s: VF reset failed\n", __func__); } if (!i40e_check_asq_alive(hw)) { iavf_dbg_info(sc, "ASQ is not alive, re-initializing AQ\n"); pci_enable_busmaster(sc->dev); i40e_shutdown_adminq(hw); i40e_init_adminq(hw); } /* Make sure queues are disabled */ iavf_send_vc_msg(sc, IAVF_FLAG_AQ_DISABLE_QUEUES); bcopy(IF_LLADDR(ifp), tmpaddr, ETHER_ADDR_LEN); if (!cmp_etheraddr(hw->mac.addr, tmpaddr) && (i40e_validate_mac_addr(tmpaddr) == I40E_SUCCESS)) { error = iavf_del_mac_filter(sc, hw->mac.addr); if (error == 0) iavf_send_vc_msg(sc, IAVF_FLAG_AQ_DEL_MAC_FILTER); bcopy(tmpaddr, hw->mac.addr, ETH_ALEN); } error = iavf_add_mac_filter(sc, hw->mac.addr, 0); if (!error || error == EEXIST) iavf_send_vc_msg(sc, IAVF_FLAG_AQ_ADD_MAC_FILTER); iflib_set_mac(ctx, hw->mac.addr); /* Prepare the queues for operation */ iavf_init_queues(vsi); /* Set initial ITR values */ iavf_configure_itr(sc); iavf_send_vc_msg(sc, IAVF_FLAG_AQ_CONFIGURE_QUEUES); /* Set up RSS */ iavf_config_rss(sc); /* Map vectors */ iavf_send_vc_msg(sc, IAVF_FLAG_AQ_MAP_VECTORS); /* Init SW TX ring indices */ if (vsi->enable_head_writeback) ixl_init_tx_cidx(vsi); else ixl_init_tx_rsqs(vsi); /* Configure promiscuous mode */ iavf_if_promisc_set(ctx, if_getflags(ifp)); /* Enable queues */ iavf_send_vc_msg_sleep(sc, IAVF_FLAG_AQ_ENABLE_QUEUES); sc->init_state = IAVF_RUNNING; } /* - * iavf_attach() helper function; initalizes the admin queue + * iavf_attach() helper function; initializes the admin queue * and attempts to establish contact with the PF by * retrying the initial "API version" message several times * or until the PF responds. 
*/ static int iavf_setup_vc(struct iavf_sc *sc) { struct i40e_hw *hw = &sc->hw; device_t dev = sc->dev; int error = 0, ret_error = 0, asq_retries = 0; bool send_api_ver_retried = 0; /* Need to set these AQ parameters before initializing AQ */ hw->aq.num_arq_entries = IXL_AQ_LEN; hw->aq.num_asq_entries = IXL_AQ_LEN; hw->aq.arq_buf_size = IXL_AQ_BUF_SZ; hw->aq.asq_buf_size = IXL_AQ_BUF_SZ; for (int i = 0; i < IAVF_AQ_MAX_ERR; i++) { /* Initialize admin queue */ error = i40e_init_adminq(hw); if (error) { device_printf(dev, "%s: init_adminq failed: %d\n", __func__, error); ret_error = 1; continue; } iavf_dbg_init(sc, "Initialized Admin Queue; starting" " send_api_ver attempt %d", i+1); retry_send: /* Send VF's API version */ error = iavf_send_api_ver(sc); if (error) { i40e_shutdown_adminq(hw); ret_error = 2; device_printf(dev, "%s: unable to send api" " version to PF on attempt %d, error %d\n", __func__, i+1, error); } asq_retries = 0; while (!i40e_asq_done(hw)) { if (++asq_retries > IAVF_AQ_MAX_ERR) { i40e_shutdown_adminq(hw); device_printf(dev, "Admin Queue timeout " "(waiting for send_api_ver), %d more tries...\n", IAVF_AQ_MAX_ERR - (i + 1)); ret_error = 3; break; } i40e_msec_pause(10); } if (asq_retries > IAVF_AQ_MAX_ERR) continue; iavf_dbg_init(sc, "Sent API version message to PF"); /* Verify that the VF accepts the PF's API version */ error = iavf_verify_api_ver(sc); if (error == ETIMEDOUT) { if (!send_api_ver_retried) { /* Resend message, one more time */ send_api_ver_retried = true; device_printf(dev, "%s: Timeout while verifying API version on first" " try!\n", __func__); goto retry_send; } else { device_printf(dev, "%s: Timeout while verifying API version on second" " try!\n", __func__); ret_error = 4; break; } } if (error) { device_printf(dev, "%s: Unable to verify API version," " error %s\n", __func__, i40e_stat_str(hw, error)); ret_error = 5; } break; } if (ret_error >= 4) i40e_shutdown_adminq(hw); return (ret_error); } /* * iavf_attach() helper function; asks the PF for this VF's * configuration, and saves the information if it receives it.
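 * The request is retried once on an ETIMEDOUT response (see the
 * retry_config label), and the reply buffer is sized for up to
 * I40E_MAX_VF_VSI VSI resource structures.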
*/ static int iavf_vf_config(struct iavf_sc *sc) { struct i40e_hw *hw = &sc->hw; device_t dev = sc->dev; int bufsz, error = 0, ret_error = 0; int asq_retries, retried = 0; retry_config: error = iavf_send_vf_config_msg(sc); if (error) { device_printf(dev, "%s: Unable to send VF config request, attempt %d," " error %d\n", __func__, retried + 1, error); ret_error = 2; } asq_retries = 0; while (!i40e_asq_done(hw)) { if (++asq_retries > IAVF_AQ_MAX_ERR) { device_printf(dev, "%s: Admin Queue timeout " "(waiting for send_vf_config_msg), attempt %d\n", __func__, retried + 1); ret_error = 3; goto fail; } i40e_msec_pause(10); } iavf_dbg_init(sc, "Sent VF config message to PF, attempt %d\n", retried + 1); if (!sc->vf_res) { bufsz = sizeof(struct virtchnl_vf_resource) + (I40E_MAX_VF_VSI * sizeof(struct virtchnl_vsi_resource)); sc->vf_res = malloc(bufsz, M_IAVF, M_NOWAIT); if (!sc->vf_res) { device_printf(dev, "%s: Unable to allocate memory for VF configuration" " message from PF on attempt %d\n", __func__, retried + 1); ret_error = 1; goto fail; } } /* Check for VF config response */ error = iavf_get_vf_config(sc); if (error == ETIMEDOUT) { /* The 1st time we timeout, send the configuration message again */ if (!retried) { retried++; goto retry_config; } device_printf(dev, "%s: iavf_get_vf_config() timed out waiting for a response\n", __func__); } if (error) { device_printf(dev, "%s: Unable to get VF configuration from PF after %d tries!\n", __func__, retried + 1); ret_error = 4; } goto done; fail: free(sc->vf_res, M_IAVF); done: return (ret_error); } static int iavf_if_msix_intr_assign(if_ctx_t ctx, int msix) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; struct ixl_rx_queue *rx_que = vsi->rx_queues; struct ixl_tx_queue *tx_que = vsi->tx_queues; int err, i, rid, vector = 0; char buf[16]; MPASS(vsi->shared->isc_nrxqsets > 0); MPASS(vsi->shared->isc_ntxqsets > 0); /* Admin Queue is vector 0 */ rid = vector + 1; err = iflib_irq_alloc_generic(ctx, &vsi->irq, rid, IFLIB_INTR_ADMIN, iavf_msix_adminq, sc, 0, "aq"); if (err) { iflib_irq_free(ctx, &vsi->irq); device_printf(iflib_get_dev(ctx), "Failed to register Admin Queue handler\n"); return (err); } /* Now set up the stations */ for (i = 0, vector = 1; i < vsi->shared->isc_nrxqsets; i++, vector++, rx_que++) { rid = vector + 1; snprintf(buf, sizeof(buf), "rxq%d", i); err = iflib_irq_alloc_generic(ctx, &rx_que->que_irq, rid, IFLIB_INTR_RX, iavf_msix_que, rx_que, rx_que->rxr.me, buf); /* XXX: Does the driver work as expected if there are fewer num_rx_queues than * what's expected in the iflib context? */ if (err) { device_printf(iflib_get_dev(ctx), "Failed to allocate queue RX int vector %d, err: %d\n", i, err); vsi->num_rx_queues = i + 1; goto fail; } rx_que->msix = vector; } bzero(buf, sizeof(buf)); for (i = 0; i < vsi->shared->isc_ntxqsets; i++, tx_que++) { snprintf(buf, sizeof(buf), "txq%d", i); iflib_softirq_alloc_generic(ctx, &vsi->rx_queues[i % vsi->shared->isc_nrxqsets].que_irq, IFLIB_INTR_TX, tx_que, tx_que->txr.me, buf); /* TODO: Maybe call a strategy function for this to figure out which * interrupts to map Tx queues to. I don't know if there's an immediately * better way than this other than a user-supplied map, though.
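 * Worked example of the current round-robin scheme (illustrative
 * numbers): with 4 RX vectors (MSI-X vectors 1-4) and 6 TX queues,
 * txq0-txq3 share vectors 1-4 and txq4/txq5 wrap back onto vectors
 * 1 and 2, since the mapping below is simply (i % isc_nrxqsets) + 1.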
*/ tx_que->msix = (i % vsi->shared->isc_nrxqsets) + 1; } return (0); fail: iflib_irq_free(ctx, &vsi->irq); rx_que = vsi->rx_queues; for (int i = 0; i < vsi->num_rx_queues; i++, rx_que++) iflib_irq_free(ctx, &rx_que->que_irq); return (err); } /* Enable all interrupts */ static void iavf_if_enable_intr(if_ctx_t ctx) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; iavf_enable_intr(vsi); } /* Disable all interrupts */ static void iavf_if_disable_intr(if_ctx_t ctx) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; iavf_disable_intr(vsi); } static int iavf_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; struct i40e_hw *hw = vsi->hw; struct ixl_rx_queue *rx_que = &vsi->rx_queues[rxqid]; iavf_enable_queue_irq(hw, rx_que->msix - 1); return (0); } static int iavf_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; struct i40e_hw *hw = vsi->hw; struct ixl_tx_queue *tx_que = &vsi->tx_queues[txqid]; iavf_enable_queue_irq(hw, tx_que->msix - 1); return (0); } static int iavf_if_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int ntxqs, int ntxqsets) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; if_softc_ctx_t scctx = vsi->shared; struct ixl_tx_queue *que; int i, j, error = 0; MPASS(scctx->isc_ntxqsets > 0); MPASS(ntxqs == 1); MPASS(scctx->isc_ntxqsets == ntxqsets); /* Allocate queue structure memory */ if (!(vsi->tx_queues = (struct ixl_tx_queue *) malloc(sizeof(struct ixl_tx_queue) *ntxqsets, M_IAVF, M_NOWAIT | M_ZERO))) { device_printf(iflib_get_dev(ctx), "Unable to allocate TX ring memory\n"); return (ENOMEM); } for (i = 0, que = vsi->tx_queues; i < ntxqsets; i++, que++) { struct tx_ring *txr = &que->txr; txr->me = i; que->vsi = vsi; if (!vsi->enable_head_writeback) { /* Allocate report status array */ if (!(txr->tx_rsq = malloc(sizeof(qidx_t) * scctx->isc_ntxd[0], M_IAVF, M_NOWAIT))) { device_printf(iflib_get_dev(ctx), "failed to allocate tx_rsq memory\n"); error = ENOMEM; goto fail; } /* Init report status array */ for (j = 0; j < scctx->isc_ntxd[0]; j++) txr->tx_rsq[j] = QIDX_INVALID; } /* get the virtual and physical address of the hardware queues */ txr->tail = I40E_QTX_TAIL1(txr->me); txr->tx_base = (struct i40e_tx_desc *)vaddrs[i * ntxqs]; txr->tx_paddr = paddrs[i * ntxqs]; txr->que = que; } return (0); fail: iavf_if_queues_free(ctx); return (error); } static int iavf_if_rx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int nrxqs, int nrxqsets) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; struct ixl_rx_queue *que; int i, error = 0; #ifdef INVARIANTS if_softc_ctx_t scctx = vsi->shared; MPASS(scctx->isc_nrxqsets > 0); MPASS(nrxqs == 1); MPASS(scctx->isc_nrxqsets == nrxqsets); #endif /* Allocate queue structure memory */ if (!(vsi->rx_queues = (struct ixl_rx_queue *) malloc(sizeof(struct ixl_rx_queue) * nrxqsets, M_IAVF, M_NOWAIT | M_ZERO))) { device_printf(iflib_get_dev(ctx), "Unable to allocate RX ring memory\n"); error = ENOMEM; goto fail; } for (i = 0, que = vsi->rx_queues; i < nrxqsets; i++, que++) { struct rx_ring *rxr = &que->rxr; rxr->me = i; que->vsi = vsi; /* get the virtual and physical address of the hardware queues */ rxr->tail = I40E_QRX_TAIL1(rxr->me); rxr->rx_base = (union i40e_rx_desc *)vaddrs[i * nrxqs]; rxr->rx_paddr = paddrs[i * nrxqs]; rxr->que = que; } return (0); 
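/*
 * Error path: iavf_if_queues_free() releases whatever TX/RX queue
 * structure memory was allocated before the failure.
 */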
fail: iavf_if_queues_free(ctx); return (error); } static void iavf_if_queues_free(if_ctx_t ctx) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; if (!vsi->enable_head_writeback) { struct ixl_tx_queue *que; int i = 0; for (i = 0, que = vsi->tx_queues; i < vsi->shared->isc_ntxqsets; i++, que++) { struct tx_ring *txr = &que->txr; if (txr->tx_rsq != NULL) { free(txr->tx_rsq, M_IAVF); txr->tx_rsq = NULL; } } } if (vsi->tx_queues != NULL) { free(vsi->tx_queues, M_IAVF); vsi->tx_queues = NULL; } if (vsi->rx_queues != NULL) { free(vsi->rx_queues, M_IAVF); vsi->rx_queues = NULL; } } static int iavf_check_aq_errors(struct iavf_sc *sc) { struct i40e_hw *hw = &sc->hw; device_t dev = sc->dev; u32 reg, oldreg; u8 aq_error = false; /* check for Admin queue errors */ oldreg = reg = rd32(hw, hw->aq.arq.len); if (reg & I40E_VF_ARQLEN1_ARQVFE_MASK) { device_printf(dev, "ARQ VF Error detected\n"); reg &= ~I40E_VF_ARQLEN1_ARQVFE_MASK; aq_error = true; } if (reg & I40E_VF_ARQLEN1_ARQOVFL_MASK) { device_printf(dev, "ARQ Overflow Error detected\n"); reg &= ~I40E_VF_ARQLEN1_ARQOVFL_MASK; aq_error = true; } if (reg & I40E_VF_ARQLEN1_ARQCRIT_MASK) { device_printf(dev, "ARQ Critical Error detected\n"); reg &= ~I40E_VF_ARQLEN1_ARQCRIT_MASK; aq_error = true; } if (oldreg != reg) wr32(hw, hw->aq.arq.len, reg); oldreg = reg = rd32(hw, hw->aq.asq.len); if (reg & I40E_VF_ATQLEN1_ATQVFE_MASK) { device_printf(dev, "ASQ VF Error detected\n"); reg &= ~I40E_VF_ATQLEN1_ATQVFE_MASK; aq_error = true; } if (reg & I40E_VF_ATQLEN1_ATQOVFL_MASK) { device_printf(dev, "ASQ Overflow Error detected\n"); reg &= ~I40E_VF_ATQLEN1_ATQOVFL_MASK; aq_error = true; } if (reg & I40E_VF_ATQLEN1_ATQCRIT_MASK) { device_printf(dev, "ASQ Critical Error detected\n"); reg &= ~I40E_VF_ATQLEN1_ATQCRIT_MASK; aq_error = true; } if (oldreg != reg) wr32(hw, hw->aq.asq.len, reg); if (aq_error) { device_printf(dev, "WARNING: Stopping VF!\n"); /* * A VF reset might not be enough to fix a problem here; * a PF reset could be required. */ sc->init_state = IAVF_RESET_REQUIRED; iavf_stop(sc); iavf_request_reset(sc); } return (aq_error ? EIO : 0); } static enum i40e_status_code iavf_process_adminq(struct iavf_sc *sc, u16 *pending) { enum i40e_status_code status = I40E_SUCCESS; struct i40e_arq_event_info event; struct i40e_hw *hw = &sc->hw; struct virtchnl_msg *v_msg; int error = 0, loop = 0; u32 reg; error = iavf_check_aq_errors(sc); if (error) return (I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR); event.buf_len = IXL_AQ_BUF_SZ; event.msg_buf = sc->aq_buffer; bzero(event.msg_buf, IXL_AQ_BUF_SZ); v_msg = (struct virtchnl_msg *)&event.desc; /* clean and process any events */ do { status = i40e_clean_arq_element(hw, &event, pending); /* * Also covers normal case when i40e_clean_arq_element() * returns "I40E_ERR_ADMIN_QUEUE_NO_WORK" */ if (status) break; iavf_vc_completion(sc, v_msg->v_opcode, v_msg->v_retval, event.msg_buf, event.msg_len); bzero(event.msg_buf, IXL_AQ_BUF_SZ); } while (*pending && (loop++ < IXL_ADM_LIMIT)); /* Re-enable admin queue interrupt cause */ reg = rd32(hw, I40E_VFINT_ICR0_ENA1); reg |= I40E_VFINT_ICR0_ENA1_ADMINQ_MASK; wr32(hw, I40E_VFINT_ICR0_ENA1, reg); return (status); } static void iavf_if_update_admin_status(if_ctx_t ctx) { struct iavf_sc *sc = iflib_get_softc(ctx); struct i40e_hw *hw = &sc->hw; u16 pending; iavf_process_adminq(sc, &pending); iavf_update_link_status(sc); /* * If there are still messages to process, reschedule. * Otherwise, re-enable the Admin Queue interrupt. 
*/ if (pending > 0) iflib_admin_intr_deferred(ctx); else iavf_enable_adminq_irq(hw); } static u_int iavf_mc_filter_apply(void *arg, struct sockaddr_dl *sdl, u_int count __unused) { struct iavf_sc *sc = arg; int error; error = iavf_add_mac_filter(sc, (u8*)LLADDR(sdl), IXL_FILTER_MC); return (!error); } static void iavf_if_multi_set(if_ctx_t ctx) { struct iavf_sc *sc = iflib_get_softc(ctx); IOCTL_DEBUGOUT("iavf_if_multi_set: begin"); if (__predict_false(if_llmaddr_count(iflib_get_ifp(ctx)) >= MAX_MULTICAST_ADDR)) { /* Delete MC filters and enable multicast promisc instead */ iavf_init_multi(sc); sc->promisc_flags |= FLAG_VF_MULTICAST_PROMISC; iavf_send_vc_msg(sc, IAVF_FLAG_AQ_CONFIGURE_PROMISC); return; } /* If there aren't too many filters, delete existing MC filters */ iavf_init_multi(sc); /* And (re-)install filters for all mcast addresses */ if (if_foreach_llmaddr(iflib_get_ifp(ctx), iavf_mc_filter_apply, sc) > 0) iavf_send_vc_msg(sc, IAVF_FLAG_AQ_ADD_MAC_FILTER); } static int iavf_if_mtu_set(if_ctx_t ctx, uint32_t mtu) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; IOCTL_DEBUGOUT("ioctl: SIOCSIFMTU (Set Interface MTU)"); if (mtu > IXL_MAX_FRAME - ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN) return (EINVAL); vsi->shared->isc_max_frame_size = mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN; return (0); } static void iavf_if_media_status(if_ctx_t ctx, struct ifmediareq *ifmr) { #ifdef IXL_DEBUG struct ifnet *ifp = iflib_get_ifp(ctx); #endif struct iavf_sc *sc = iflib_get_softc(ctx); INIT_DBG_IF(ifp, "begin"); iavf_update_link_status(sc); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!sc->link_up) return; ifmr->ifm_status |= IFM_ACTIVE; /* Hardware is always full-duplex */ ifmr->ifm_active |= IFM_FDX; /* Based on the link speed reported by the PF over the AdminQ, choose a * PHY type to report. This isn't 100% correct since we don't really * know the underlying PHY type of the PF, but at least we can report * a valid link speed...
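 * For example, a PF reporting VIRTCHNL_LINK_SPEED_25GB is presented as
 * IFM_25G_SR below, even though the PF's physical port could just as
 * well be a -CR or -KR variant.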
*/ switch (sc->link_speed) { case VIRTCHNL_LINK_SPEED_100MB: ifmr->ifm_active |= IFM_100_TX; break; case VIRTCHNL_LINK_SPEED_1GB: ifmr->ifm_active |= IFM_1000_T; break; case VIRTCHNL_LINK_SPEED_10GB: ifmr->ifm_active |= IFM_10G_SR; break; case VIRTCHNL_LINK_SPEED_20GB: case VIRTCHNL_LINK_SPEED_25GB: ifmr->ifm_active |= IFM_25G_SR; break; case VIRTCHNL_LINK_SPEED_40GB: ifmr->ifm_active |= IFM_40G_SR4; break; default: ifmr->ifm_active |= IFM_UNKNOWN; break; } INIT_DBG_IF(ifp, "end"); } static int iavf_if_media_change(if_ctx_t ctx) { struct ifmedia *ifm = iflib_get_media(ctx); INIT_DEBUGOUT("ixl_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); if_printf(iflib_get_ifp(ctx), "Media change is not supported.\n"); return (ENODEV); } static int iavf_if_promisc_set(if_ctx_t ctx, int flags) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ifnet *ifp = iflib_get_ifp(ctx); sc->promisc_flags = 0; if (flags & IFF_ALLMULTI || if_llmaddr_count(ifp) >= MAX_MULTICAST_ADDR) sc->promisc_flags |= FLAG_VF_MULTICAST_PROMISC; if (flags & IFF_PROMISC) sc->promisc_flags |= FLAG_VF_UNICAST_PROMISC; iavf_send_vc_msg(sc, IAVF_FLAG_AQ_CONFIGURE_PROMISC); return (0); } static void iavf_if_timer(if_ctx_t ctx, uint16_t qid) { struct iavf_sc *sc = iflib_get_softc(ctx); struct i40e_hw *hw = &sc->hw; u32 val; if (qid != 0) return; /* Check for when PF triggers a VF reset */ val = rd32(hw, I40E_VFGEN_RSTAT) & I40E_VFGEN_RSTAT_VFR_STATE_MASK; if (val != VIRTCHNL_VFR_VFACTIVE && val != VIRTCHNL_VFR_COMPLETED) { iavf_dbg_info(sc, "reset in progress! (%d)\n", val); return; } /* Fire off the adminq task */ iflib_admin_intr_deferred(ctx); /* Update stats */ iavf_request_stats(sc); } static void iavf_if_vlan_register(if_ctx_t ctx, u16 vtag) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; struct iavf_vlan_filter *v; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; ++vsi->num_vlans; v = malloc(sizeof(struct iavf_vlan_filter), M_IAVF, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(sc->vlan_filters, v, next); v->vlan = vtag; v->flags = IXL_FILTER_ADD; iavf_send_vc_msg(sc, IAVF_FLAG_AQ_ADD_VLAN_FILTER); } static void iavf_if_vlan_unregister(if_ctx_t ctx, u16 vtag) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; struct iavf_vlan_filter *v; int i = 0; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; SLIST_FOREACH(v, sc->vlan_filters, next) { if (v->vlan == vtag) { v->flags = IXL_FILTER_DEL; ++i; --vsi->num_vlans; } } if (i) iavf_send_vc_msg(sc, IAVF_FLAG_AQ_DEL_VLAN_FILTER); } static uint64_t iavf_if_get_counter(if_ctx_t ctx, ift_counter cnt) { struct iavf_sc *sc = iflib_get_softc(ctx); struct ixl_vsi *vsi = &sc->vsi; if_t ifp = iflib_get_ifp(ctx); switch (cnt) { case IFCOUNTER_IPACKETS: return (vsi->ipackets); case IFCOUNTER_IERRORS: return (vsi->ierrors); case IFCOUNTER_OPACKETS: return (vsi->opackets); case IFCOUNTER_OERRORS: return (vsi->oerrors); case IFCOUNTER_COLLISIONS: /* Collisions are by standard impossible in 40G/10G Ethernet */ return (0); case IFCOUNTER_IBYTES: return (vsi->ibytes); case IFCOUNTER_OBYTES: return (vsi->obytes); case IFCOUNTER_IMCASTS: return (vsi->imcasts); case IFCOUNTER_OMCASTS: return (vsi->omcasts); case IFCOUNTER_IQDROPS: return (vsi->iqdrops); case IFCOUNTER_OQDROPS: return (vsi->oqdrops); case IFCOUNTER_NOPROTO: return (vsi->noproto); default: return (if_get_counter_default(ifp, cnt)); } } static void iavf_free_pci_resources(struct iavf_sc *sc) { struct ixl_vsi *vsi = &sc->vsi; struct ixl_rx_queue *rx_que 
= vsi->rx_queues; device_t dev = sc->dev; /* We may get here before stations are set up */ if (rx_que == NULL) goto early; /* Release all interrupts */ iflib_irq_free(vsi->ctx, &vsi->irq); for (int i = 0; i < vsi->num_rx_queues; i++, rx_que++) iflib_irq_free(vsi->ctx, &rx_que->que_irq); early: if (sc->pci_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, rman_get_rid(sc->pci_mem), sc->pci_mem); } /* ** Requests a VF reset from the PF. ** ** Requires the VF's Admin Queue to be initialized. */ static int iavf_reset(struct iavf_sc *sc) { struct i40e_hw *hw = &sc->hw; device_t dev = sc->dev; int error = 0; /* Ask the PF to reset us if we are initiating */ if (sc->init_state != IAVF_RESET_PENDING) iavf_request_reset(sc); i40e_msec_pause(100); error = iavf_reset_complete(hw); if (error) { device_printf(dev, "%s: VF reset failed\n", __func__); return (error); } pci_enable_busmaster(dev); error = i40e_shutdown_adminq(hw); if (error) { device_printf(dev, "%s: shutdown_adminq failed: %d\n", __func__, error); return (error); } error = i40e_init_adminq(hw); if (error) { device_printf(dev, "%s: init_adminq failed: %d\n", __func__, error); return (error); } iavf_enable_adminq_irq(hw); return (0); } static int iavf_reset_complete(struct i40e_hw *hw) { u32 reg; /* Wait up to ~10 seconds */ for (int i = 0; i < 100; i++) { reg = rd32(hw, I40E_VFGEN_RSTAT) & I40E_VFGEN_RSTAT_VFR_STATE_MASK; if ((reg == VIRTCHNL_VFR_VFACTIVE) || (reg == VIRTCHNL_VFR_COMPLETED)) return (0); i40e_msec_pause(100); } return (EBUSY); } static void iavf_setup_interface(device_t dev, struct iavf_sc *sc) { struct ixl_vsi *vsi = &sc->vsi; if_ctx_t ctx = vsi->ctx; struct ifnet *ifp = iflib_get_ifp(ctx); INIT_DBG_DEV(dev, "begin"); vsi->shared->isc_max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN; #if __FreeBSD_version >= 1100000 if_setbaudrate(ifp, IF_Gbps(40)); #else if_initbaudrate(ifp, IF_Gbps(40)); #endif ifmedia_add(vsi->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(vsi->media, IFM_ETHER | IFM_AUTO); } /* ** Get a new filter and add it to the mac filter list. 
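** The entry is returned zeroed (M_ZERO) and already linked at the head
** of sc->mac_filters; the caller is expected to fill in macaddr and
** flags.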
*/ static struct iavf_mac_filter * iavf_get_mac_filter(struct iavf_sc *sc) { struct iavf_mac_filter *f; f = malloc(sizeof(struct iavf_mac_filter), M_IAVF, M_NOWAIT | M_ZERO); if (f) SLIST_INSERT_HEAD(sc->mac_filters, f, next); return (f); } /* ** Find the filter with matching MAC address */ static struct iavf_mac_filter * iavf_find_mac_filter(struct iavf_sc *sc, u8 *macaddr) { struct iavf_mac_filter *f; bool match = FALSE; SLIST_FOREACH(f, sc->mac_filters, next) { if (cmp_etheraddr(f->macaddr, macaddr)) { match = TRUE; break; } } if (!match) f = NULL; return (f); } /* ** Admin Queue interrupt handler */ static int iavf_msix_adminq(void *arg) { struct iavf_sc *sc = arg; struct i40e_hw *hw = &sc->hw; u32 reg, mask; bool do_task = FALSE; ++sc->admin_irq; reg = rd32(hw, I40E_VFINT_ICR01); /* * For masking off interrupt causes that need to be handled before * they can be re-enabled */ mask = rd32(hw, I40E_VFINT_ICR0_ENA1); /* Check on the cause */ if (reg & I40E_VFINT_ICR0_ADMINQ_MASK) { mask &= ~I40E_VFINT_ICR0_ENA_ADMINQ_MASK; do_task = TRUE; } wr32(hw, I40E_VFINT_ICR0_ENA1, mask); iavf_enable_adminq_irq(hw); if (do_task) return (FILTER_SCHEDULE_THREAD); else return (FILTER_HANDLED); } void iavf_enable_intr(struct ixl_vsi *vsi) { struct i40e_hw *hw = vsi->hw; struct ixl_rx_queue *que = vsi->rx_queues; iavf_enable_adminq_irq(hw); for (int i = 0; i < vsi->num_rx_queues; i++, que++) iavf_enable_queue_irq(hw, que->rxr.me); } void iavf_disable_intr(struct ixl_vsi *vsi) { struct i40e_hw *hw = vsi->hw; struct ixl_rx_queue *que = vsi->rx_queues; for (int i = 0; i < vsi->num_rx_queues; i++, que++) iavf_disable_queue_irq(hw, que->rxr.me); } static void iavf_disable_adminq_irq(struct i40e_hw *hw) { wr32(hw, I40E_VFINT_DYN_CTL01, 0); wr32(hw, I40E_VFINT_ICR0_ENA1, 0); /* flush */ rd32(hw, I40E_VFGEN_RSTAT); } static void iavf_enable_adminq_irq(struct i40e_hw *hw) { wr32(hw, I40E_VFINT_DYN_CTL01, I40E_VFINT_DYN_CTL01_INTENA_MASK | I40E_VFINT_DYN_CTL01_ITR_INDX_MASK); wr32(hw, I40E_VFINT_ICR0_ENA1, I40E_VFINT_ICR0_ENA1_ADMINQ_MASK); /* flush */ rd32(hw, I40E_VFGEN_RSTAT); } static void iavf_enable_queue_irq(struct i40e_hw *hw, int id) { u32 reg; reg = I40E_VFINT_DYN_CTLN1_INTENA_MASK | I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK | I40E_VFINT_DYN_CTLN1_ITR_INDX_MASK; wr32(hw, I40E_VFINT_DYN_CTLN1(id), reg); } static void iavf_disable_queue_irq(struct i40e_hw *hw, int id) { wr32(hw, I40E_VFINT_DYN_CTLN1(id), I40E_VFINT_DYN_CTLN1_ITR_INDX_MASK); rd32(hw, I40E_VFGEN_RSTAT); } static void iavf_configure_tx_itr(struct iavf_sc *sc) { struct i40e_hw *hw = &sc->hw; struct ixl_vsi *vsi = &sc->vsi; struct ixl_tx_queue *que = vsi->tx_queues; vsi->tx_itr_setting = sc->tx_itr; for (int i = 0; i < vsi->num_tx_queues; i++, que++) { struct tx_ring *txr = &que->txr; wr32(hw, I40E_VFINT_ITRN1(IXL_TX_ITR, i), vsi->tx_itr_setting); txr->itr = vsi->tx_itr_setting; txr->latency = IXL_AVE_LATENCY; } } static void iavf_configure_rx_itr(struct iavf_sc *sc) { struct i40e_hw *hw = &sc->hw; struct ixl_vsi *vsi = &sc->vsi; struct ixl_rx_queue *que = vsi->rx_queues; vsi->rx_itr_setting = sc->rx_itr; for (int i = 0; i < vsi->num_rx_queues; i++, que++) { struct rx_ring *rxr = &que->rxr; wr32(hw, I40E_VFINT_ITRN1(IXL_RX_ITR, i), vsi->rx_itr_setting); rxr->itr = vsi->rx_itr_setting; rxr->latency = IXL_AVE_LATENCY; } } /* * Get initial ITR values from tunable values. */ static void iavf_configure_itr(struct iavf_sc *sc) { iavf_configure_tx_itr(sc); iavf_configure_rx_itr(sc); } /* ** Provide an update to the queue RX ** interrupt moderation value.
*/ static void iavf_set_queue_rx_itr(struct ixl_rx_queue *que) { struct ixl_vsi *vsi = que->vsi; struct i40e_hw *hw = vsi->hw; struct rx_ring *rxr = &que->rxr; /* Idle, do nothing */ if (rxr->bytes == 0) return; /* Update the hardware if needed */ if (rxr->itr != vsi->rx_itr_setting) { rxr->itr = vsi->rx_itr_setting; wr32(hw, I40E_VFINT_ITRN1(IXL_RX_ITR, que->rxr.me), rxr->itr); } } static int iavf_msix_que(void *arg) { struct ixl_rx_queue *rx_que = arg; ++rx_que->irqs; iavf_set_queue_rx_itr(rx_que); // iavf_set_queue_tx_itr(que); return (FILTER_SCHEDULE_THREAD); } /********************************************************************* * Multicast Initialization * * This routine is called by init to reset a fresh state. * **********************************************************************/ static void iavf_init_multi(struct iavf_sc *sc) { struct iavf_mac_filter *f; int mcnt = 0; /* First clear any multicast filters */ SLIST_FOREACH(f, sc->mac_filters, next) { if ((f->flags & IXL_FILTER_USED) && (f->flags & IXL_FILTER_MC)) { f->flags |= IXL_FILTER_DEL; mcnt++; } } if (mcnt > 0) iavf_send_vc_msg(sc, IAVF_FLAG_AQ_DEL_MAC_FILTER); } /* ** Note: this routine updates the OS on the link state ** the real check of the hardware only happens with ** a link interrupt. */ void iavf_update_link_status(struct iavf_sc *sc) { struct ixl_vsi *vsi = &sc->vsi; u64 baudrate; if (sc->link_up){ if (vsi->link_active == FALSE) { vsi->link_active = TRUE; baudrate = ixl_max_vc_speed_to_value(sc->link_speed); iavf_dbg_info(sc, "baudrate: %lu\n", baudrate); iflib_link_state_change(vsi->ctx, LINK_STATE_UP, baudrate); } } else { /* Link down */ if (vsi->link_active == TRUE) { vsi->link_active = FALSE; iflib_link_state_change(vsi->ctx, LINK_STATE_DOWN, 0); } } } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. 
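 * (For this VF driver the work actually amounts to disabling interrupts
 * and, if the queues were enabled, asking the PF to disable them; see
 * iavf_stop() below.)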
* **********************************************************************/ static void iavf_stop(struct iavf_sc *sc) { struct ifnet *ifp; ifp = sc->vsi.ifp; iavf_disable_intr(&sc->vsi); if (atomic_load_acq_32(&sc->queues_enabled)) iavf_send_vc_msg_sleep(sc, IAVF_FLAG_AQ_DISABLE_QUEUES); } static void iavf_if_stop(if_ctx_t ctx) { struct iavf_sc *sc = iflib_get_softc(ctx); iavf_stop(sc); } static void iavf_config_rss_reg(struct iavf_sc *sc) { struct i40e_hw *hw = &sc->hw; struct ixl_vsi *vsi = &sc->vsi; u32 lut = 0; u64 set_hena = 0, hena; int i, j, que_id; u32 rss_seed[IXL_RSS_KEY_SIZE_REG]; #ifdef RSS u32 rss_hash_config; #endif /* Don't set up RSS if using a single queue */ if (vsi->num_rx_queues == 1) { wr32(hw, I40E_VFQF_HENA(0), 0); wr32(hw, I40E_VFQF_HENA(1), 0); ixl_flush(hw); return; } #ifdef RSS /* Fetch the configured RSS key */ rss_getkey((uint8_t *) &rss_seed); #else ixl_get_default_rss_key(rss_seed); #endif /* Fill out hash function seed */ for (i = 0; i < IXL_RSS_KEY_SIZE_REG; i++) wr32(hw, I40E_VFQF_HKEY(i), rss_seed[i]); /* Enable PCTYPES for RSS: */ #ifdef RSS rss_hash_config = rss_gethashconfig(); if (rss_hash_config & RSS_HASHTYPE_RSS_IPV4) set_hena |= ((u64)1 << I40E_FILTER_PCTYPE_NONF_IPV4_OTHER); if (rss_hash_config & RSS_HASHTYPE_RSS_TCP_IPV4) set_hena |= ((u64)1 << I40E_FILTER_PCTYPE_NONF_IPV4_TCP); if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV4) set_hena |= ((u64)1 << I40E_FILTER_PCTYPE_NONF_IPV4_UDP); if (rss_hash_config & RSS_HASHTYPE_RSS_IPV6) set_hena |= ((u64)1 << I40E_FILTER_PCTYPE_NONF_IPV6_OTHER); if (rss_hash_config & RSS_HASHTYPE_RSS_IPV6_EX) set_hena |= ((u64)1 << I40E_FILTER_PCTYPE_FRAG_IPV6); if (rss_hash_config & RSS_HASHTYPE_RSS_TCP_IPV6) set_hena |= ((u64)1 << I40E_FILTER_PCTYPE_NONF_IPV6_TCP); if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV6) set_hena |= ((u64)1 << I40E_FILTER_PCTYPE_NONF_IPV6_UDP); #else set_hena = IXL_DEFAULT_RSS_HENA_XL710; #endif hena = (u64)rd32(hw, I40E_VFQF_HENA(0)) | ((u64)rd32(hw, I40E_VFQF_HENA(1)) << 32); hena |= set_hena; wr32(hw, I40E_VFQF_HENA(0), (u32)hena); wr32(hw, I40E_VFQF_HENA(1), (u32)(hena >> 32)); /* Populate the LUT with max no. of queues in round robin fashion */ for (i = 0, j = 0; i < IXL_RSS_VSI_LUT_SIZE; i++, j++) { if (j == vsi->num_rx_queues) j = 0; #ifdef RSS /* * Fetch the RSS bucket id for the given indirection entry. * Cap it at the number of configured buckets (which is * num_rx_queues.) */ que_id = rss_get_indirection_to_bucket(i); que_id = que_id % vsi->num_rx_queues; #else que_id = j; #endif /* lut = 4-byte sliding window of 4 lut entries */ lut = (lut << 8) | (que_id & IXL_RSS_VF_LUT_ENTRY_MASK); /* On i = 3, we have 4 entries in lut; write to the register */ if ((i & 3) == 3) { wr32(hw, I40E_VFQF_HLUT(i >> 2), lut); DDPRINTF(sc->dev, "HLUT(%2d): %#010x", i, lut); } } ixl_flush(hw); } static void iavf_config_rss_pf(struct iavf_sc *sc) { iavf_send_vc_msg(sc, IAVF_FLAG_AQ_CONFIG_RSS_KEY); iavf_send_vc_msg(sc, IAVF_FLAG_AQ_SET_RSS_HENA); iavf_send_vc_msg(sc, IAVF_FLAG_AQ_CONFIG_RSS_LUT); } /* ** iavf_config_rss - setup RSS ** ** RSS keys and table are cleared on VF reset. 
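** Depending on the capabilities the PF advertised, this is done either
** directly through VF registers (VIRTCHNL_VF_OFFLOAD_RSS_REG) or by
** virtchnl messages to the PF (VIRTCHNL_VF_OFFLOAD_RSS_PF).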
*/ static void iavf_config_rss(struct iavf_sc *sc) { if (sc->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_RSS_REG) { iavf_dbg_info(sc, "Setting up RSS using VF registers..."); iavf_config_rss_reg(sc); } else if (sc->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_RSS_PF) { iavf_dbg_info(sc, "Setting up RSS using messages to PF..."); iavf_config_rss_pf(sc); } else device_printf(sc->dev, "VF does not support RSS capability sent by PF.\n"); } /* ** This routine adds new MAC filters to the sc's list; ** these are later added in hardware by sending a virtual ** channel message. */ static int iavf_add_mac_filter(struct iavf_sc *sc, u8 *macaddr, u16 flags) { struct iavf_mac_filter *f; /* Does one already exist? */ f = iavf_find_mac_filter(sc, macaddr); if (f != NULL) { iavf_dbg_filter(sc, "exists: " MAC_FORMAT "\n", MAC_FORMAT_ARGS(macaddr)); return (EEXIST); } /* If not, get a new empty filter */ f = iavf_get_mac_filter(sc); if (f == NULL) { device_printf(sc->dev, "%s: no filters available!!\n", __func__); return (ENOMEM); } iavf_dbg_filter(sc, "marked: " MAC_FORMAT "\n", MAC_FORMAT_ARGS(macaddr)); bcopy(macaddr, f->macaddr, ETHER_ADDR_LEN); f->flags |= (IXL_FILTER_ADD | IXL_FILTER_USED); f->flags |= flags; return (0); } /* ** Marks a MAC filter for deletion. */ static int iavf_del_mac_filter(struct iavf_sc *sc, u8 *macaddr) { struct iavf_mac_filter *f; f = iavf_find_mac_filter(sc, macaddr); if (f == NULL) return (ENOENT); f->flags |= IXL_FILTER_DEL; return (0); } /* * Re-uses the name from the PF driver. */ static void iavf_add_device_sysctls(struct iavf_sc *sc) { struct ixl_vsi *vsi = &sc->vsi; device_t dev = sc->dev; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid_list *ctx_list = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); struct sysctl_oid *debug_node; struct sysctl_oid_list *debug_list; SYSCTL_ADD_PROC(ctx, ctx_list, OID_AUTO, "current_speed", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, iavf_sysctl_current_speed, "A", "Current Port Speed"); SYSCTL_ADD_PROC(ctx, ctx_list, OID_AUTO, "tx_itr", CTLTYPE_INT | CTLFLAG_RW, sc, 0, iavf_sysctl_tx_itr, "I", "Immediately set TX ITR value for all queues"); SYSCTL_ADD_PROC(ctx, ctx_list, OID_AUTO, "rx_itr", CTLTYPE_INT | CTLFLAG_RW, sc, 0, iavf_sysctl_rx_itr, "I", "Immediately set RX ITR value for all queues"); /* Add sysctls meant to print debug information, but don't list them * in "sysctl -a" output. 
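 * They remain readable when named explicitly, e.g.
 * "sysctl dev.iavf.0.debug.filter_list"; CTLFLAG_SKIP only hides the
 * node from the default tree walk.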
*/ debug_node = SYSCTL_ADD_NODE(ctx, ctx_list, OID_AUTO, "debug", CTLFLAG_RD | CTLFLAG_SKIP, NULL, "Debug Sysctls"); debug_list = SYSCTL_CHILDREN(debug_node); SYSCTL_ADD_UINT(ctx, debug_list, OID_AUTO, "shared_debug_mask", CTLFLAG_RW, &sc->hw.debug_mask, 0, "Shared code debug message level"); SYSCTL_ADD_UINT(ctx, debug_list, OID_AUTO, "core_debug_mask", CTLFLAG_RW, &sc->dbg_mask, 0, "Non-shared code debug message level"); SYSCTL_ADD_PROC(ctx, debug_list, OID_AUTO, "filter_list", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, iavf_sysctl_sw_filter_list, "A", "SW Filter List"); SYSCTL_ADD_PROC(ctx, debug_list, OID_AUTO, "queue_interrupt_table", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, iavf_sysctl_queue_interrupt_table, "A", "View MSI-X indices for TX/RX queues"); SYSCTL_ADD_PROC(ctx, debug_list, OID_AUTO, "do_vf_reset", CTLTYPE_INT | CTLFLAG_WR, sc, 0, iavf_sysctl_vf_reset, "A", "Request a VF reset from PF"); SYSCTL_ADD_PROC(ctx, debug_list, OID_AUTO, "do_vflr_reset", CTLTYPE_INT | CTLFLAG_WR, sc, 0, iavf_sysctl_vflr_reset, "A", "Request a VFLR reset from HW"); /* Add stats sysctls */ ixl_add_vsi_sysctls(dev, vsi, ctx, "vsi"); ixl_add_queues_sysctls(dev, vsi); } static void iavf_init_filters(struct iavf_sc *sc) { sc->mac_filters = malloc(sizeof(struct mac_list), M_IAVF, M_WAITOK | M_ZERO); SLIST_INIT(sc->mac_filters); sc->vlan_filters = malloc(sizeof(struct vlan_list), M_IAVF, M_WAITOK | M_ZERO); SLIST_INIT(sc->vlan_filters); } static void iavf_free_filters(struct iavf_sc *sc) { struct iavf_mac_filter *f; struct iavf_vlan_filter *v; while (!SLIST_EMPTY(sc->mac_filters)) { f = SLIST_FIRST(sc->mac_filters); SLIST_REMOVE_HEAD(sc->mac_filters, next); free(f, M_IAVF); } free(sc->mac_filters, M_IAVF); while (!SLIST_EMPTY(sc->vlan_filters)) { v = SLIST_FIRST(sc->vlan_filters); SLIST_REMOVE_HEAD(sc->vlan_filters, next); free(v, M_IAVF); } free(sc->vlan_filters, M_IAVF); } char * iavf_vc_speed_to_string(enum virtchnl_link_speed link_speed) { int index; char *speeds[] = { "Unknown", "100 Mbps", "1 Gbps", "10 Gbps", "40 Gbps", "20 Gbps", "25 Gbps", }; switch (link_speed) { case VIRTCHNL_LINK_SPEED_100MB: index = 1; break; case VIRTCHNL_LINK_SPEED_1GB: index = 2; break; case VIRTCHNL_LINK_SPEED_10GB: index = 3; break; case VIRTCHNL_LINK_SPEED_40GB: index = 4; break; case VIRTCHNL_LINK_SPEED_20GB: index = 5; break; case VIRTCHNL_LINK_SPEED_25GB: index = 6; break; case VIRTCHNL_LINK_SPEED_UNKNOWN: default: index = 0; break; } return speeds[index]; } static int iavf_sysctl_current_speed(SYSCTL_HANDLER_ARGS) { struct iavf_sc *sc = (struct iavf_sc *)arg1; int error = 0; error = sysctl_handle_string(oidp, iavf_vc_speed_to_string(sc->link_speed), 8, req); return (error); } /* * Sanity check and save off tunable values. 
*/ static void iavf_save_tunables(struct iavf_sc *sc) { device_t dev = sc->dev; /* Save tunable information */ sc->dbg_mask = iavf_core_debug_mask; sc->hw.debug_mask = iavf_shared_debug_mask; sc->vsi.enable_head_writeback = !!(iavf_enable_head_writeback); if (iavf_tx_itr < 0 || iavf_tx_itr > IXL_MAX_ITR) { device_printf(dev, "Invalid tx_itr value of %d set!\n", iavf_tx_itr); device_printf(dev, "tx_itr must be between %d and %d, " "inclusive\n", 0, IXL_MAX_ITR); device_printf(dev, "Using default value of %d instead\n", IXL_ITR_4K); sc->tx_itr = IXL_ITR_4K; } else sc->tx_itr = iavf_tx_itr; if (iavf_rx_itr < 0 || iavf_rx_itr > IXL_MAX_ITR) { device_printf(dev, "Invalid rx_itr value of %d set!\n", iavf_rx_itr); device_printf(dev, "rx_itr must be between %d and %d, " "inclusive\n", 0, IXL_MAX_ITR); device_printf(dev, "Using default value of %d instead\n", IXL_ITR_8K); sc->rx_itr = IXL_ITR_8K; } else sc->rx_itr = iavf_rx_itr; } /* * Used to set the Tx ITR value for all of the VF's queues. * Writes to the ITR registers immediately. */ static int iavf_sysctl_tx_itr(SYSCTL_HANDLER_ARGS) { struct iavf_sc *sc = (struct iavf_sc *)arg1; device_t dev = sc->dev; int requested_tx_itr; int error = 0; requested_tx_itr = sc->tx_itr; error = sysctl_handle_int(oidp, &requested_tx_itr, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (requested_tx_itr < 0 || requested_tx_itr > IXL_MAX_ITR) { device_printf(dev, "Invalid TX itr value; value must be between 0 and %d\n", IXL_MAX_ITR); return (EINVAL); } sc->tx_itr = requested_tx_itr; iavf_configure_tx_itr(sc); return (error); } /* * Used to set the Rx ITR value for all of the VF's queues. * Writes to the ITR registers immediately. */ static int iavf_sysctl_rx_itr(SYSCTL_HANDLER_ARGS) { struct iavf_sc *sc = (struct iavf_sc *)arg1; device_t dev = sc->dev; int requested_rx_itr; int error = 0; requested_rx_itr = sc->rx_itr; error = sysctl_handle_int(oidp, &requested_rx_itr, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (requested_rx_itr < 0 || requested_rx_itr > IXL_MAX_ITR) { device_printf(dev, "Invalid RX itr value; value must be between 0 and %d\n", IXL_MAX_ITR); return (EINVAL); } sc->rx_itr = requested_rx_itr; iavf_configure_rx_itr(sc); return (error); } static int iavf_sysctl_sw_filter_list(SYSCTL_HANDLER_ARGS) { struct iavf_sc *sc = (struct iavf_sc *)arg1; struct iavf_mac_filter *f; struct iavf_vlan_filter *v; device_t dev = sc->dev; int ftl_len, ftl_counter = 0, error = 0; struct sbuf *buf; buf = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (!buf) { device_printf(dev, "Could not allocate sbuf for output.\n"); return (ENOMEM); } sbuf_printf(buf, "\n"); /* Print MAC filters */ sbuf_printf(buf, "MAC Filters:\n"); ftl_len = 0; SLIST_FOREACH(f, sc->mac_filters, next) ftl_len++; if (ftl_len < 1) sbuf_printf(buf, "(none)\n"); else { SLIST_FOREACH(f, sc->mac_filters, next) { sbuf_printf(buf, MAC_FORMAT ", flags %#06x\n", MAC_FORMAT_ARGS(f->macaddr), f->flags); } } /* Print VLAN filters */ sbuf_printf(buf, "VLAN Filters:\n"); ftl_len = 0; SLIST_FOREACH(v, sc->vlan_filters, next) ftl_len++; if (ftl_len < 1) sbuf_printf(buf, "(none)"); else { SLIST_FOREACH(v, sc->vlan_filters, next) { sbuf_printf(buf, "%d, flags %#06x", v->vlan, v->flags); /* don't print '\n' for last entry */ if (++ftl_counter != ftl_len) sbuf_printf(buf, "\n"); } } error = sbuf_finish(buf); if (error) device_printf(dev, "Error finishing sbuf: %d\n", error); sbuf_delete(buf); return (error); } /* * Print out mapping of TX queue indexes and Rx queue indexes * to MSI-X 
vectors. */ static int iavf_sysctl_queue_interrupt_table(SYSCTL_HANDLER_ARGS) { struct iavf_sc *sc = (struct iavf_sc *)arg1; struct ixl_vsi *vsi = &sc->vsi; device_t dev = sc->dev; struct sbuf *buf; int error = 0; struct ixl_rx_queue *rx_que = vsi->rx_queues; struct ixl_tx_queue *tx_que = vsi->tx_queues; buf = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (!buf) { device_printf(dev, "Could not allocate sbuf for output.\n"); return (ENOMEM); } sbuf_cat(buf, "\n"); for (int i = 0; i < vsi->num_rx_queues; i++) { rx_que = &vsi->rx_queues[i]; sbuf_printf(buf, "(rxq %3d): %d\n", i, rx_que->msix); } for (int i = 0; i < vsi->num_tx_queues; i++) { tx_que = &vsi->tx_queues[i]; sbuf_printf(buf, "(txq %3d): %d\n", i, tx_que->msix); } error = sbuf_finish(buf); if (error) device_printf(dev, "Error finishing sbuf: %d\n", error); sbuf_delete(buf); return (error); } #define CTX_ACTIVE(ctx) ((if_getdrvflags(iflib_get_ifp(ctx)) & IFF_DRV_RUNNING)) static int iavf_sysctl_vf_reset(SYSCTL_HANDLER_ARGS) { struct iavf_sc *sc = (struct iavf_sc *)arg1; int do_reset = 0, error = 0; error = sysctl_handle_int(oidp, &do_reset, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (do_reset == 1) { iavf_reset(sc); if (CTX_ACTIVE(sc->vsi.ctx)) iflib_request_reset(sc->vsi.ctx); } return (error); } static int iavf_sysctl_vflr_reset(SYSCTL_HANDLER_ARGS) { struct iavf_sc *sc = (struct iavf_sc *)arg1; device_t dev = sc->dev; int do_reset = 0, error = 0; error = sysctl_handle_int(oidp, &do_reset, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (do_reset == 1) { if (!pcie_flr(dev, max(pcie_get_max_completion_timeout(dev) / 1000, 10), true)) { device_printf(dev, "PCIE FLR failed\n"); error = EIO; } else if (CTX_ACTIVE(sc->vsi.ctx)) iflib_request_reset(sc->vsi.ctx); } return (error); } #undef CTX_ACTIVE Index: head/sys/dev/le/lancevar.h =================================================================== --- head/sys/dev/le/lancevar.h (revision 357663) +++ head/sys/dev/le/lancevar.h (revision 357664) @@ -1,211 +1,211 @@ /* $NetBSD: lancevar.h,v 1.10 2005/12/11 12:21:27 christos Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace * Simulation Facility, NASA Ames Research Center. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* $FreeBSD$ */ #ifndef _DEV_LE_LANCEVAR_H_ #define _DEV_LE_LANCEVAR_H_ extern devclass_t le_devclass; struct lance_softc { struct ifnet *sc_ifp; struct ifmedia sc_media; struct mtx sc_mtx; struct callout sc_wdog_ch; int sc_wdog_timer; /* * Memory functions: * * copy to/from descriptor * copy to/from buffer * zero bytes in buffer */ void (*sc_copytodesc)(struct lance_softc *, void *, int, int); void (*sc_copyfromdesc)(struct lance_softc *, void *, int, int); void (*sc_copytobuf)(struct lance_softc *, void *, int, int); void (*sc_copyfrombuf)(struct lance_softc *, void *, int, int); void (*sc_zerobuf)(struct lance_softc *, int, int); /* * Machine-dependent functions: * * read/write CSR * hardware reset hook - may be NULL * hardware init hook - may be NULL * no carrier hook - may be NULL * media change hook - may be NULL */ uint16_t (*sc_rdcsr)(struct lance_softc *, uint16_t); void (*sc_wrcsr)(struct lance_softc *, uint16_t, uint16_t); void (*sc_hwreset)(struct lance_softc *); void (*sc_hwinit)(struct lance_softc *); int (*sc_hwintr)(struct lance_softc *); void (*sc_nocarrier)(struct lance_softc *); int (*sc_mediachange)(struct lance_softc *); void (*sc_mediastatus)(struct lance_softc *, struct ifmediareq *); /* * Media-supported by this interface. If this is NULL, * the only supported media is assumed to be "manual". */ const int *sc_supmedia; int sc_nsupmedia; int sc_defaultmedia; uint16_t sc_conf3; /* CSR3 value */ void *sc_mem; /* base address of RAM - CPU's view */ bus_addr_t sc_addr; /* base address of RAM - LANCE's view */ bus_size_t sc_memsize; /* size of RAM */ int sc_nrbuf; /* number of receive buffers */ int sc_ntbuf; /* number of transmit buffers */ int sc_last_rd; int sc_first_td; int sc_last_td; int sc_no_td; int sc_initaddr; int sc_rmdaddr; int sc_tmdaddr; int sc_rbufaddr; int sc_tbufaddr; uint8_t sc_enaddr[ETHER_ADDR_LEN]; void (*sc_meminit)(struct lance_softc *); void (*sc_start_locked)(struct lance_softc *); int sc_flags; #define LE_ALLMULTI (1 << 0) #define LE_BSWAP (1 << 1) #define LE_CARRIER (1 << 2) #define LE_DEBUG (1 << 3) #define LE_PROMISC (1 << 4) }; #define LE_LOCK_INIT(_sc, _name) \ mtx_init(&(_sc)->sc_mtx, _name, MTX_NETWORK_LOCK, MTX_DEF) #define LE_LOCK_INITIALIZED(_sc) mtx_initialized(&(_sc)->sc_mtx) #define LE_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx) #define LE_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx) #define LE_LOCK_ASSERT(_sc, _what) mtx_assert(&(_sc)->sc_mtx, (_what)) #define LE_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->sc_mtx) /* * Unfortunately, manual byte swapping is only necessary for the PCnet-PCI * variants but not for the original LANCE or ILACC so we cannot do this * with #ifdefs resolved at compile time. */ #define LE_HTOLE16(v) (((sc)->sc_flags & LE_BSWAP) ? htole16(v) : (v)) #define LE_HTOLE32(v) (((sc)->sc_flags & LE_BSWAP) ? htole32(v) : (v)) #define LE_LE16TOH(v) (((sc)->sc_flags & LE_BSWAP) ? le16toh(v) : (v)) #define LE_LE32TOH(v) (((sc)->sc_flags & LE_BSWAP) ? 
le32toh(v) : (v)) int lance_config(struct lance_softc *, const char*, int); void lance_attach(struct lance_softc *); void lance_detach(struct lance_softc *); void lance_suspend(struct lance_softc *); void lance_resume(struct lance_softc *); void lance_init_locked(struct lance_softc *); int lance_put(struct lance_softc *, int, struct mbuf *); struct mbuf *lance_get(struct lance_softc *, int, int); void lance_setladrf(struct lance_softc *, u_int16_t *); /* * The following functions are only useful on certain CPU/bus * combinations. They should be written in assembly language for * maximum efficiency, but machine-independent versions are provided * for drivers that have not yet been optimized. */ void lance_copytobuf_contig(struct lance_softc *, void *, int, int); void lance_copyfrombuf_contig(struct lance_softc *, void *, int, int); void lance_zerobuf_contig(struct lance_softc *, int, int); #if 0 /* Example only - see lance.c */ void lance_copytobuf_gap2(struct lance_softc *, void *, int, int); void lance_copyfrombuf_gap2(struct lance_softc *, void *, int, int); void lance_zerobuf_gap2(struct lance_softc *, int, int); void lance_copytobuf_gap16(struct lance_softc *, void *, int, int); void lance_copyfrombuf_gap16(struct lance_softc *, void *, int, int); void lance_zerobuf_gap16(struct lance_softc *, int, int); #endif /* Example only */ /* * Compare two Ether/802 addresses for equality, inlined and * unrolled for speed. Use this like memcmp(). * * XXX: Add for stuff like this? * XXX: or maybe add it to libkern.h instead? * * "I'd love to have an inline assembler version of this." * XXX: Who wanted that? mycroft? I wrote one, but this * version in C is as good as hand-coded assembly. -gwr * * Please do NOT tweak this without looking at the actual * assembly code generated before and after your tweaks! */ static inline uint16_t ether_cmp(void *one, void *two) { uint16_t *a = (u_short *)one; uint16_t *b = (u_short *)two; uint16_t diff; #ifdef m68k /* * The post-increment-pointer form produces the best * machine code for m68k. This was carefully tuned * so it compiles to just 8 short (2-byte) op-codes! */ diff = *a++ - *b++; diff |= *a++ - *b++; diff |= *a++ - *b++; #else /* - * Most modern CPUs do better with a single expresion. + * Most modern CPUs do better with a single expression. * Note that short-cut evaluation is NOT helpful here, * because it just makes the code longer, not faster! */ diff = (a[0] - b[0]) | (a[1] - b[1]) | (a[2] - b[2]); #endif return (diff); } #endif /* _DEV_LE_LANCEVAR_H_ */ Index: head/sys/dev/pms/RefTisa/discovery/dm/dmdisc.c =================================================================== --- head/sys/dev/pms/RefTisa/discovery/dm/dmdisc.c (revision 357663) +++ head/sys/dev/pms/RefTisa/discovery/dm/dmdisc.c (revision 357664) @@ -1,7466 +1,7466 @@ /******************************************************************************* ** *Copyright (c) 2014 PMC-Sierra, Inc. All rights reserved. * *Redistribution and use in source and binary forms, with or without modification, are permitted provided *that the following conditions are met: *1. Redistributions of source code must retain the above copyright notice, this list of conditions and the *following disclaimer. *2. Redistributions in binary form must reproduce the above copyright notice, *this list of conditions and the following disclaimer in the documentation and/or other materials provided *with the distribution. 
* *THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED *WARRANTIES,INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS *FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT *NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR *BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT *LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE ** ********************************************************************************/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #ifdef FDS_DM #include #include #include #include #include #include /*****************************************************************************/ /*! \brief dmDiscover * * * Purpose: A discovery is started by this function * * \param dmRoot: DM context handle. * \param dmPortContext: Pointer to this instance of port context * \param option: Discovery option * * \return: * DM_RC_SUCCESS * DM_RC_FAILURE * */ /*****************************************************************************/ osGLOBAL bit32 dmDiscover( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext, bit32 option) { dmIntPortContext_t *onePortContext = agNULL; bit32 ret = DM_RC_FAILURE; DM_DBG3(("dmDiscover: start\n")); onePortContext = (dmIntPortContext_t *)dmPortContext->dmData; if (onePortContext == agNULL) { DM_DBG1(("dmDiscover: onePortContext is NULL!!!\n")); return DM_RC_FAILURE; } if (onePortContext->valid == agFALSE) { DM_DBG1(("dmDiscover: invalid port!!!\n")); return DM_RC_FAILURE; } if (onePortContext->RegFailed == agTRUE) { DM_DBG1(("dmDiscover: Registration failed!!!\n")); return DM_RC_FAILURE; } switch ( option ) { case DM_DISCOVERY_OPTION_FULL_START: DM_DBG3(("dmDiscover: full, pid %d\n", onePortContext->id)); onePortContext->discovery.type = DM_DISCOVERY_OPTION_FULL_START; dmDiscoveryResetMCN(dmRoot, onePortContext); ret = dmFullDiscover(dmRoot, onePortContext); break; case DM_DISCOVERY_OPTION_INCREMENTAL_START: DM_DBG3(("dmDiscover: incremental, pid %d\n", onePortContext->id)); onePortContext->discovery.type = DM_DISCOVERY_OPTION_INCREMENTAL_START; dmDiscoveryResetMCN(dmRoot, onePortContext); ret = dmIncrementalDiscover(dmRoot, onePortContext, agFALSE); break; case DM_DISCOVERY_OPTION_ABORT: DM_DBG3(("dmDiscover: abort\n")); if (onePortContext->DiscoveryState != DM_DSTATE_COMPLETED) { if (onePortContext->discovery.pendingSMP == 0) { dmDiscoverAbort(dmRoot, onePortContext); tddmDiscoverCB( dmRoot, onePortContext->dmPortContext, dmDiscAborted ); } else { DM_DBG3(("dmDiscover: abortInProgress\n")); onePortContext->DiscoveryAbortInProgress = agTRUE; tddmDiscoverCB( dmRoot, dmPortContext, dmDiscAbortInProgress ); } } else { DM_DBG3(("dmDiscover: no discovery to abort\n")); tddmDiscoverCB( dmRoot, dmPortContext, dmDiscAbortInvalid ); } ret = DM_RC_SUCCESS; break; default: break; } return ret; } osGLOBAL bit32 dmFullDiscover( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmExpander_t *oneExpander = agNULL; dmSASSubID_t dmSASSubID; dmDeviceData_t *oneExpDeviceData = agNULL; DM_DBG1(("dmFullDiscover: start\n")); if (onePortContext->valid == agFALSE) { DM_DBG1(("dmFullDiscover: invalid port!!!\n")); 
return DM_RC_FAILURE; } if (onePortContext->DiscoveryState == DM_DSTATE_STARTED) { DM_DBG1(("dmFullDiscover: no two instances of discovery allowed!!!\n")); return DM_RC_FAILURE; } onePortContext->DiscoveryState = DM_DSTATE_STARTED; dmSASSubID.sasAddressHi = onePortContext->sasRemoteAddressHi; dmSASSubID.sasAddressLo = onePortContext->sasRemoteAddressLo; /* check OnePortContext->discovery.discoveringExpanderList */ oneExpander = dmExpFind(dmRoot, onePortContext, dmSASSubID.sasAddressHi, dmSASSubID.sasAddressLo); if (oneExpander != agNULL) { oneExpDeviceData = oneExpander->dmDevice; } else { /* check dmAllShared->mainExpanderList */ oneExpander = dmExpMainListFind(dmRoot, onePortContext, dmSASSubID.sasAddressHi, dmSASSubID.sasAddressLo); if (oneExpander != agNULL) { oneExpDeviceData = oneExpander->dmDevice; } } if (oneExpDeviceData != agNULL) { dmSASSubID.initiator_ssp_stp_smp = oneExpDeviceData->initiator_ssp_stp_smp; dmSASSubID.target_ssp_stp_smp = oneExpDeviceData->target_ssp_stp_smp; oneExpDeviceData->registered = agTRUE; dmAddSASToSharedcontext(dmRoot, onePortContext, &dmSASSubID, oneExpDeviceData, 0xFF); } else { DM_DBG1(("dmFullDiscover:oneExpDeviceData is NULL!!!\n")); return DM_RC_FAILURE; } dmUpStreamDiscoverStart(dmRoot, onePortContext); return DM_RC_SUCCESS; } osGLOBAL bit32 dmIncrementalDiscover( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, bit32 flag ) { dmExpander_t *oneExpander = agNULL; dmSASSubID_t dmSASSubID; dmDeviceData_t *oneExpDeviceData = agNULL; DM_DBG1(("dmIncrementalDiscover: start\n")); if (onePortContext->valid == agFALSE) { DM_DBG1(("dmIncrementalDiscover: invalid port!!!\n")); return DM_RC_FAILURE; } /* TDM triggered: let it go; DM triggered: check for a discovery already in progress */ if (flag == agFALSE) { if (onePortContext->DiscoveryState == DM_DSTATE_STARTED) { DM_DBG1(("dmIncrementalDiscover: no two instances of discovery allowed!!!\n")); return DM_RC_FAILURE; } } onePortContext->DiscoveryState = DM_DSTATE_STARTED; onePortContext->discovery.type = DM_DISCOVERY_OPTION_INCREMENTAL_START; dmSASSubID.sasAddressHi = onePortContext->sasRemoteAddressHi; dmSASSubID.sasAddressLo = onePortContext->sasRemoteAddressLo; /* check OnePortContext->discovery.discoveringExpanderList */ oneExpander = dmExpFind(dmRoot, onePortContext, dmSASSubID.sasAddressHi, dmSASSubID.sasAddressLo); if (oneExpander != agNULL) { oneExpDeviceData = oneExpander->dmDevice; } else { /* check dmAllShared->mainExpanderList */ oneExpander = dmExpMainListFind(dmRoot, onePortContext, dmSASSubID.sasAddressHi, dmSASSubID.sasAddressLo); if (oneExpander != agNULL) { oneExpDeviceData = oneExpander->dmDevice; } } if (oneExpDeviceData != agNULL) { dmSASSubID.initiator_ssp_stp_smp = oneExpDeviceData->initiator_ssp_stp_smp; dmSASSubID.target_ssp_stp_smp = oneExpDeviceData->target_ssp_stp_smp; oneExpDeviceData->registered = agTRUE; dmAddSASToSharedcontext(dmRoot, onePortContext, &dmSASSubID, oneExpDeviceData, 0xFF); } else { DM_DBG1(("dmIncrementalDiscover:oneExpDeviceData is NULL!!!\n")); return DM_RC_FAILURE; } dmUpStreamDiscoverStart(dmRoot, onePortContext); return DM_RC_SUCCESS; } osGLOBAL void dmUpStreamDiscoverStart( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { // dmExpander_t *oneExpander = agNULL; bit32 sasAddressHi, sasAddressLo; dmDeviceData_t *oneDeviceData; dmExpander_t *oneExpander = agNULL; DM_DBG3(("dmUpStreamDiscoverStart: start\n")); if (onePortContext->valid == agFALSE) { DM_DBG1(("dmUpStreamDiscoverStart: invalid port!!!\n")); return; } /* at this point, the 1st expander should have been registered.
find an expander from onePortContext */ sasAddressHi = onePortContext->sasRemoteAddressHi; sasAddressLo = onePortContext->sasRemoteAddressLo; DM_DBG3(("dmUpStreamDiscoverStart: Port Remote AddrHi 0x%08x Remote AddrLo 0x%08x\n", sasAddressHi, sasAddressLo)); oneDeviceData = dmDeviceFind(dmRoot, onePortContext, sasAddressHi, sasAddressLo); // oneDeviceData = oneExpander->dmDevice; // start here onePortContext->discovery.status = DISCOVERY_UP_STREAM; if (oneDeviceData == agNULL) { DM_DBG1(("dmUpStreamDiscoverStart: oneDeviceData is NULL, wrong!!!\n")); return; } else { if ( (oneDeviceData->SASSpecDeviceType == SAS_EDGE_EXPANDER_DEVICE) || (oneDeviceData->SASSpecDeviceType == SAS_FANOUT_EXPANDER_DEVICE) || DEVICE_IS_SMP_TARGET(oneDeviceData) ) { #if 1 /* for incremental discovery */ /* start here: if not on discoveringExpanderList, alloc and add dmNewEXPorNot() */ oneExpander = dmExpFind(dmRoot, onePortContext, sasAddressHi, sasAddressLo); if ( oneExpander == agNULL) { /* alloc and add */ oneExpander = dmDiscoveringExpanderAlloc(dmRoot, onePortContext, oneDeviceData); if ( oneExpander != agNULL) { dmDiscoveringExpanderAdd(dmRoot, onePortContext, oneExpander); } else { DM_DBG1(("dmUpStreamDiscoverStart: failed to allocate expander or discovery aborted!!!\n")); return; } } #endif dmUpStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } else { DM_DBG1(("dmUpStreamDiscoverStart: oneDeviceData is not an Expander did %d, wrong!!!\n", oneDeviceData->id)); return; } } return; } /* sends report general */ osGLOBAL void dmUpStreamDiscovering( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData ) { dmList_t *ExpanderList; dmExpander_t *oneNextExpander = agNULL; DM_DBG3(("dmUpStreamDiscovering: start\n")); if (onePortContext->valid == agFALSE) { DM_DBG1(("dmUpStreamDiscovering: invalid port!!!\n")); return; } tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); if (DMLIST_EMPTY(&(onePortContext->discovery.discoveringExpanderList))) { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); DM_DBG3(("dmUpStreamDiscovering: should be the end\n")); oneNextExpander = agNULL; } else { DMLIST_DEQUEUE_FROM_HEAD(&ExpanderList, &(onePortContext->discovery.discoveringExpanderList)); oneNextExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if ( oneNextExpander != agNULL) { DMLIST_ENQUEUE_AT_HEAD(&(oneNextExpander->linkNode), &(onePortContext->discovery.discoveringExpanderList)); DM_DBG3(("dmUpStreamDiscovering tdsaSASUpStreamDiscovering: dequeue head\n")); DM_DBG3(("dmUpStreamDiscovering: expander id %d\n", oneNextExpander->id)); } else { DM_DBG1(("dmUpStreamDiscovering: oneNextExpander is NULL!!!\n")); } tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } if (oneNextExpander != agNULL) { dmReportGeneralSend(dmRoot, oneNextExpander->dmDevice); } else { DM_DBG3(("dmUpStreamDiscovering: No more expander list\n")); dmDownStreamDiscoverStart(dmRoot, onePortContext, oneDeviceData); } return; } osGLOBAL void dmDownStreamDiscoverStart( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData ) { dmExpander_t *UpStreamExpander; dmExpander_t *oneExpander; DM_DBG3(("dmDownStreamDiscoverStart: start\n")); if (dmDiscoverCheck(dmRoot, onePortContext) == agTRUE) { DM_DBG1(("dmDownStreamDiscoverStart: invalid port or aborted discovery!!!\n")); return; } /* set discovery status */ onePortContext->discovery.status = DISCOVERY_DOWN_STREAM; /* If it's an expander */ if ( (oneDeviceData->SASSpecDeviceType == SAS_EDGE_EXPANDER_DEVICE) ||
(oneDeviceData->SASSpecDeviceType == SAS_FANOUT_EXPANDER_DEVICE) || DEVICE_IS_SMP_TARGET(oneDeviceData) ) { oneExpander = oneDeviceData->dmExpander; UpStreamExpander = oneExpander->dmUpStreamExpander; /* If the two expanders are the root of two edge sets; sub-to-sub */ if ( (UpStreamExpander != agNULL) && ( UpStreamExpander->dmUpStreamExpander == oneExpander ) ) { DM_DBG3(("dmDownStreamDiscoverStart: Root found pExpander=%p pUpStreamExpander=%p\n", oneExpander, UpStreamExpander)); // Saves the root expander onePortContext->discovery.RootExp = oneExpander; DM_DBG3(("dmDownStreamDiscoverStart: Root exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmDownStreamDiscoverStart: Root exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); /* reset up stream inform for pExpander */ oneExpander->dmUpStreamExpander = agNULL; /* Add the pExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, oneExpander); /* reset up stream inform for oneExpander */ UpStreamExpander->dmUpStreamExpander = agNULL; /* Add the UpStreamExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, UpStreamExpander); } /* If the two expanders are not the root of two edge sets, e.g., one root */ else { // Saves the root expander onePortContext->discovery.RootExp = oneExpander; DM_DBG3(("dmDownStreamDiscoverStart: NO Root pExpander=%p\n", oneExpander)); DM_DBG3(("dmDownStreamDiscoverStart: Root exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmDownStreamDiscoverStart: Root exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); /* (2.2.2.1) Add the pExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, oneExpander); } } /* Continue down stream discovering */ dmDownStreamDiscovering(dmRoot, onePortContext, oneDeviceData); return; } osGLOBAL void dmDownStreamDiscovering( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData ) { dmExpander_t *NextExpander = agNULL; dmList_t *ExpanderList; DM_DBG3(("dmDownStreamDiscovering: start\n")); if (dmDiscoverCheck(dmRoot, onePortContext) == agTRUE) { DM_DBG1(("dmDownStreamDiscovering: invalid port or aborted discovery!!!\n")); return; } tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); if (DMLIST_EMPTY(&(onePortContext->discovery.discoveringExpanderList))) { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); DM_DBG3(("dmDownStreamDiscovering: should be the end\n")); NextExpander = agNULL; } else { DMLIST_DEQUEUE_FROM_HEAD(&ExpanderList, &(onePortContext->discovery.discoveringExpanderList)); NextExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if ( NextExpander != agNULL) { DMLIST_ENQUEUE_AT_HEAD(&(NextExpander->linkNode), &(onePortContext->discovery.discoveringExpanderList)); DM_DBG3(("dmDownStreamDiscovering tdsaSASDownStreamDiscovering: dequeue head\n")); DM_DBG3(("dmDownStreamDiscovering: expander id %d\n", NextExpander->id)); } else { DM_DBG1(("dmDownStreamDiscovering: NextExpander is NULL!!!\n")); } tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } /* If there is an expander to continue discovering with */ if ( NextExpander != agNULL) { DM_DBG3(("dmDownStreamDiscovering: Found pNextExpander=%p discoveryStatus=0x%x\n", NextExpander, onePortContext->discovery.status)); switch (onePortContext->discovery.status) { /* If the discovery status is DISCOVERY_DOWN_STREAM */ case DISCOVERY_DOWN_STREAM: /* Send report general for the next expander */
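/* Note: discoveringExpanderList acts as the work queue of a breadth-first walk over the expander topology; the head entry is only peeked above (dequeued, then immediately re-enqueued at the head) and is taken off the list for good by dmDiscoveringExpanderRemove() once all of its phys have been walked. */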
DM_DBG3(("dmDownStreamDiscovering: DownStream pNextExpander=%p\n", NextExpander)); DM_DBG3(("dmDownStreamDiscovering: oneDeviceData %p did %d\n", oneDeviceData, oneDeviceData->id)); DM_DBG3(("dmDownStreamDiscovering: oneExpander %p did %d\n", oneDeviceData->dmExpander, oneDeviceData->dmExpander->id)); DM_DBG3(("dmDownStreamDiscovering: 2nd oneDeviceData %p did %d\n", NextExpander->dmDevice, NextExpander->dmDevice->id)); DM_DBG3(("dmDownStreamDiscovering: 2nd oneExpander %p did %d\n", NextExpander, NextExpander->id)); DM_DBG3(("dmDownStreamDiscovering: 2nd used oneExpander %p did %d\n", NextExpander->dmDevice->dmExpander, NextExpander->dmDevice->dmExpander->id)); if (NextExpander != NextExpander->dmDevice->dmExpander) { DM_DBG3(("dmDownStreamDiscovering: wrong!!!\n")); } dmReportGeneralSend(dmRoot, NextExpander->dmDevice); break; /* If the discovery status is DISCOVERY_CONFIG_ROUTING */ case DISCOVERY_CONFIG_ROUTING: case DISCOVERY_REPORT_PHY_SATA: /* set discovery status */ onePortContext->discovery.status = DISCOVERY_DOWN_STREAM; DM_DBG3(("dmDownStreamDiscovering: pPort->discovery.status=DISCOVERY_CONFIG_ROUTING, make it DOWN_STREAM\n")); /* If not the last phy */ if ( NextExpander->discoveringPhyId < NextExpander->dmDevice->numOfPhys ) { DM_DBG3(("dmDownStreamDiscovering: pNextExpander->discoveringPhyId=0x%x pNextExpander->numOfPhys=0x%x. Send More Discover\n", NextExpander->discoveringPhyId, NextExpander->dmDevice->numOfPhys)); /* Send discover for the next expander */ dmDiscoverSend(dmRoot, NextExpander->dmDevice); } /* If it's the last phy */ else { DM_DBG3(("dmDownStreamDiscovering: Last Phy, remove expander %p start DownStream=%p\n", NextExpander, NextExpander->dmDevice)); dmDiscoveringExpanderRemove(dmRoot, onePortContext, NextExpander); dmDownStreamDiscovering(dmRoot, onePortContext, NextExpander->dmDevice); } break; default: DM_DBG3(("dmDownStreamDiscovering: *** Unknown pPort->discovery.status=0x%x\n", onePortContext->discovery.status)); } } /* If there is no expander to continue discovering with */ else { DM_DBG3(("dmDownStreamDiscovering: No more expander DONE\n")); /* discover done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_SUCCESS); } return; } osGLOBAL void dmUpStreamDiscoverExpanderPhy( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander, smpRespDiscover_t *pDiscoverResp ) { agsaSASIdentify_t sasIdentify; dmSASSubID_t dmSASSubID; bit32 attachedSasHi, attachedSasLo; dmExpander_t *AttachedExpander = agNULL; bit8 connectionRate; dmDeviceData_t *oneDeviceData = agNULL; dmDeviceData_t *AttachedDevice = agNULL; dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; DM_DBG3(("dmUpStreamDiscoverExpanderPhy: start\n")); if (dmDiscoverCheck(dmRoot, onePortContext) == agTRUE) { DM_DBG1(("dmUpStreamDiscoverExpanderPhy: invalid port or aborted discovery!!!\n")); return; } if (oneExpander != oneExpander->dmDevice->dmExpander) { DM_DBG1(("dmUpStreamDiscoverExpanderPhy: wrong!!!\n")); } dm_memset(&sasIdentify, 0, sizeof(agsaSASIdentify_t)); oneDeviceData = oneExpander->dmDevice; DM_DBG3(("dmUpStreamDiscoverExpanderPhy: Phy #%d of SAS %08x-%08x\n", oneExpander->discoveringPhyId, oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3((" Attached device: %s\n", ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 0 ? "No Device" : (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 1 ? "End Device" : (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 2 ?
"Edge Expander" : "Fanout Expander"))))); if ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE) { DM_DBG3((" SAS address : %08x-%08x\n", DISCRSP_GET_ATTACHED_SAS_ADDRESSHI(pDiscoverResp), DISCRSP_GET_ATTACHED_SAS_ADDRESSLO(pDiscoverResp))); DM_DBG3((" SSP Target : %d\n", DISCRSP_IS_SSP_TARGET(pDiscoverResp)?1:0)); DM_DBG3((" STP Target : %d\n", DISCRSP_IS_STP_TARGET(pDiscoverResp)?1:0)); DM_DBG3((" SMP Target : %d\n", DISCRSP_IS_SMP_TARGET(pDiscoverResp)?1:0)); DM_DBG3((" SATA DEVICE : %d\n", DISCRSP_IS_SATA_DEVICE(pDiscoverResp)?1:0)); DM_DBG3((" SSP Initiator : %d\n", DISCRSP_IS_SSP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG3((" STP Initiator : %d\n", DISCRSP_IS_STP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG3((" SMP Initiator : %d\n", DISCRSP_IS_SMP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG3((" Phy ID : %d\n", pDiscoverResp->phyIdentifier)); DM_DBG3((" Attached Phy ID: %d\n", pDiscoverResp->attachedPhyIdentifier)); } /* for debugging */ if (oneExpander->discoveringPhyId != pDiscoverResp->phyIdentifier) { DM_DBG1(("dmUpStreamDiscoverExpanderPhy: !!! Incorrect SMP response !!!\n")); DM_DBG1(("dmUpStreamDiscoverExpanderPhy: Request PhyID #%d Response PhyID #%d !!!\n", oneExpander->discoveringPhyId, pDiscoverResp->phyIdentifier)); dmhexdump("NO_DEVICE", (bit8*)pDiscoverResp, sizeof(smpRespDiscover_t)); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } /* saving routing attribute for non self-configuring expanders */ oneExpander->routingAttribute[pDiscoverResp->phyIdentifier] = (bit8)DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp); if ( oneDeviceData->SASSpecDeviceType == SAS_FANOUT_EXPANDER_DEVICE ) { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: SA_SAS_DEV_TYPE_FANOUT_EXPANDER\n")); if ( DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE) { DM_DBG1(("dmUpStreamDiscoverExpanderPhy: **** Topology Error subtractive routing on fanout expander device!!!\n")); /* discovery error */ onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmUpStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* (2.1.3) discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } } else { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: SA_SAS_DEV_TYPE_EDGE_EXPANDER\n")); if ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE) { /* Setup sasIdentify for the attached device */ sasIdentify.phyIdentifier = pDiscoverResp->phyIdentifier; sasIdentify.deviceType_addressFrameType = (bit8)(pDiscoverResp->attachedDeviceType & 0x70); sasIdentify.initiator_ssp_stp_smp = pDiscoverResp->attached_Ssp_Stp_Smp_Sata_Initiator; sasIdentify.target_ssp_stp_smp = pDiscoverResp->attached_SataPS_Ssp_Stp_Smp_Sata_Target; *(bit32*)sasIdentify.sasAddressHi = *(bit32*)pDiscoverResp->attachedSasAddressHi; *(bit32*)sasIdentify.sasAddressLo = *(bit32*)pDiscoverResp->attachedSasAddressLo; /* incremental discovery */ dmSASSubID.sasAddressHi = SA_IDFRM_GET_SAS_ADDRESSHI(&sasIdentify); dmSASSubID.sasAddressLo = SA_IDFRM_GET_SAS_ADDRESSLO(&sasIdentify); 
dmSASSubID.initiator_ssp_stp_smp = sasIdentify.initiator_ssp_stp_smp; dmSASSubID.target_ssp_stp_smp = sasIdentify.target_ssp_stp_smp; attachedSasHi = DISCRSP_GET_ATTACHED_SAS_ADDRESSHI(pDiscoverResp); attachedSasLo = DISCRSP_GET_ATTACHED_SAS_ADDRESSLO(pDiscoverResp); /* If the phy has subtractive routing attribute */ if ( DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE) { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: SA_SAS_ROUTING_SUBTRACTIVE\n")); /* Setup upstream phys */ dmExpanderUpStreamPhyAdd(dmRoot, oneExpander, (bit8) pDiscoverResp->attachedPhyIdentifier); /* If the expander already has an upstream device set up */ if (oneExpander->hasUpStreamDevice == agTRUE) { /* just to update MCN */ dmPortSASDeviceFind(dmRoot, onePortContext, attachedSasLo, attachedSasHi, oneDeviceData); /* If the sas address doesn't match */ if ( ((oneExpander->upStreamSASAddressHi != attachedSasHi) || (oneExpander->upStreamSASAddressLo != attachedSasLo)) && (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE || DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) ) { /* TODO: discovery error, callback */ DM_DBG1(("dmUpStreamDiscoverExpanderPhy: **** Topology Error subtractive routing error - inconsistent SAS address!!!\n")); /* call back to notify discovery error */ onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmUpStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } else { /* Setup SAS address for up stream device */ oneExpander->hasUpStreamDevice = agTRUE; oneExpander->upStreamSASAddressHi = attachedSasHi; oneExpander->upStreamSASAddressLo = attachedSasLo; if ( (onePortContext->sasLocalAddressHi != attachedSasHi) || (onePortContext->sasLocalAddressLo != attachedSasLo) ) { /* Find the device from the discovered list */ AttachedDevice = dmPortSASDeviceFind(dmRoot, onePortContext, attachedSasLo, attachedSasHi, oneDeviceData); /* If the device has been discovered before */ if ( AttachedDevice != agNULL) /* old device */ { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: Seen This Device Before\n")); /* If attached device is an edge expander */ if ( AttachedDevice->SASSpecDeviceType == SAS_EDGE_EXPANDER_DEVICE) { /* The attached device is an expander */ AttachedExpander = AttachedDevice->dmExpander; /* If the two expanders are the root of the two edge expander sets */ if ( (AttachedExpander->upStreamSASAddressHi == DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo)) && (AttachedExpander->upStreamSASAddressLo == DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo)) ) { /* Setup upstream expander for the pExpander */ oneExpander->dmUpStreamExpander = AttachedExpander; } /* If the two expanders are not the root of the two edge expander sets */ else { /* TODO: loop found, discovery error, callback */ DM_DBG1(("dmUpStreamDiscoverExpanderPhy: **** Topology Error loop detection!!!\n"));
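/* Note: a loop is inferred at this point because the attached expander was discovered before, yet the upstream SAS address it recorded is not this expander's own address; in a valid tree-shaped SAS topology an edge expander has exactly one upstream path, so reaching it again along a different route can only mean a loop. */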
onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmUpStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* If attached device is not an edge expander */ else { /* TODO: should not happen, ASSERT */ DM_DBG3(("dmUpStreamDiscoverExpanderPhy, *** Attached Device is not Edge. Confused!!!\n")); } } /* AttachedExpander != agNULL */ /* If the device has not been discovered before */ else /* new device */ { /* Add the device */ DM_DBG3(("dmUpStreamDiscoverExpanderPhy: New device\n")); /* read minimum rate from the configuration onePortContext->LinkRate is SPC's local link rate */ connectionRate = (bit8)MIN(onePortContext->LinkRate, DISCRSP_GET_LINKRATE(pDiscoverResp)); DM_DBG3(("dmUpStreamDiscoverExpanderPhy: link rate 0x%x\n", onePortContext->LinkRate)); DM_DBG3(("dmUpStreamDiscoverExpanderPhy: negotiatedPhyLinkRate 0x%x\n", DISCRSP_GET_LINKRATE(pDiscoverResp))); DM_DBG3(("dmUpStreamDiscoverExpanderPhy: connectionRate 0x%x\n", connectionRate)); if (DISCRSP_IS_STP_TARGET(pDiscoverResp) || DISCRSP_IS_SATA_DEVICE(pDiscoverResp)) { /* incremental discovery */ if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { /* incremental discovery */ AttachedDevice = dmFindRegNValid( dmRoot, onePortContext, &dmSASSubID ); /* not registered and not valid; add this */ if (AttachedDevice == agNULL) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } } /* DISCRSP_IS_STP_TARGET(pDiscoverResp) || DISCRSP_IS_SATA_DEVICE(pDiscoverResp) */ else { /* incremental discovery */ if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { /* incremental discovery */ AttachedDevice = dmFindRegNValid( dmRoot, onePortContext, &dmSASSubID ); /* not registered and not valid; add this */ if (AttachedDevice == agNULL) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } } /* If the device is added successfully */ if ( AttachedDevice != agNULL) { /* (3.1.2.3.2.3.2.1) callback about new device */ if ( DISCRSP_IS_SSP_TARGET(pDiscoverResp) || DISCRSP_IS_SSP_INITIATOR(pDiscoverResp) || DISCRSP_IS_SMP_INITIATOR(pDiscoverResp) ) { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: Found SSP/SMP SAS %08x-%08x\n",
attachedSasHi, attachedSasLo)); } else { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: Found a SAS STP device.\n")); } /* If the attached device is an expander */ if ( (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE) || (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) ) { /* Allocate an expander data structure */ AttachedExpander = dmDiscoveringExpanderAlloc( dmRoot, onePortContext, AttachedDevice ); DM_DBG3(("dmUpStreamDiscoverExpanderPhy: Found expander=%p\n", AttachedExpander)); /* If allocate successfully */ if ( AttachedExpander != agNULL) { /* Add the pAttachedExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, AttachedExpander); /* Setup upstream expander for the pExpander */ oneExpander->dmUpStreamExpander = AttachedExpander; } /* If failed to allocate */ else { DM_DBG1(("dmUpStreamDiscoverExpanderPhy: Failed to allocate expander data structure!!!\n")); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* If the attached device is an end device */ else { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: Found end device\n")); /* LP2006-05-26 added upstream device to the newly found device */ AttachedDevice->dmExpander = oneExpander; oneExpander->dmUpStreamExpander = agNULL; } } else { DM_DBG1(("dmUpStreamDiscoverExpanderPhy: Failed to add a device!!!\n")); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* else, new device */ } /* onePortContext->sasLocalAddressLo != attachedSasLo */ } /* else */ } /* DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE */ } /* DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE */ } /* big else */ oneExpander->discoveringPhyId ++; if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { if ( oneExpander->discoveringPhyId < oneDeviceData->numOfPhys ) { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: DISCOVERY_UP_STREAM find more ...\n")); /* continue discovery for the next phy */ dmDiscoverSend(dmRoot, oneDeviceData); } else { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: DISCOVERY_UP_STREAM last phy continue upstream..\n")); /* for MCN */ dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData); /* remove the expander from the discovering list */ dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander); /* continue upstream discovering */ dmUpStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } } else { DM_DBG3(("dmUpStreamDiscoverExpanderPhy: onePortContext->discovery.status not in DISCOVERY_UP_STREAM; status %d\n", onePortContext->discovery.status)); } DM_DBG3(("dmUpStreamDiscoverExpanderPhy: end return phyID#%d\n", oneExpander->discoveringPhyId - 1)); return; } osGLOBAL void dmUpStreamDiscover2ExpanderPhy( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander, smpRespDiscover2_t *pDiscoverResp ) { dmDeviceData_t *oneDeviceData; dmDeviceData_t *AttachedDevice = agNULL; dmExpander_t *AttachedExpander; agsaSASIdentify_t sasIdentify; bit8 connectionRate; bit32 attachedSasHi, attachedSasLo; dmSASSubID_t dmSASSubID; dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: start\n")); if (dmDiscoverCheck(dmRoot, onePortContext) == agTRUE) { DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: invalid port or aborted discovery!!!\n")); return; } if (oneExpander != oneExpander->dmDevice->dmExpander) { DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: wrong!!!\n")); } dm_memset(&sasIdentify, 0, 
sizeof(agsaSASIdentify_t)); oneDeviceData = oneExpander->dmDevice; DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: Phy #%d of SAS %08x-%08x\n", oneExpander->discoveringPhyId, oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG2((" Attached device: %s\n", ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 0 ? "No Device" : (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 1 ? "End Device" : (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 2 ? "Edge Expander" : "Fanout Expander"))))); if ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE) { DM_DBG2((" SAS address : %08x-%08x\n", SAS2_DISCRSP_GET_ATTACHED_SAS_ADDRESSHI(pDiscoverResp), SAS2_DISCRSP_GET_ATTACHED_SAS_ADDRESSLO(pDiscoverResp))); DM_DBG2((" SSP Target : %d\n", SAS2_DISCRSP_IS_SSP_TARGET(pDiscoverResp)?1:0)); DM_DBG2((" STP Target : %d\n", SAS2_DISCRSP_IS_STP_TARGET(pDiscoverResp)?1:0)); DM_DBG2((" SMP Target : %d\n", SAS2_DISCRSP_IS_SMP_TARGET(pDiscoverResp)?1:0)); DM_DBG2((" SATA DEVICE : %d\n", SAS2_DISCRSP_IS_SATA_DEVICE(pDiscoverResp)?1:0)); DM_DBG2((" SSP Initiator : %d\n", SAS2_DISCRSP_IS_SSP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG2((" STP Initiator : %d\n", SAS2_DISCRSP_IS_STP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG2((" SMP Initiator : %d\n", SAS2_DISCRSP_IS_SMP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG2((" Phy ID : %d\n", pDiscoverResp->phyIdentifier)); DM_DBG2((" Attached Phy ID: %d\n", pDiscoverResp->attachedPhyIdentifier)); } if (oneExpander->discoveringPhyId != pDiscoverResp->phyIdentifier) { DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: !!! Incorrect SMP response !!!\n")); DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: Request PhyID #%d Response PhyID #%d\n", oneExpander->discoveringPhyId, pDiscoverResp->phyIdentifier)); dmhexdump("NO_DEVICE", (bit8*)pDiscoverResp, sizeof(smpRespDiscover2_t)); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } /* saving routing attribute for non self-configuring expanders */ oneExpander->routingAttribute[pDiscoverResp->phyIdentifier] = SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp); if ( oneDeviceData->SASSpecDeviceType == SAS_FANOUT_EXPANDER_DEVICE ) { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: SA_SAS_DEV_TYPE_FANOUT_EXPANDER\n")); if ( SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE) { DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: **** Topology Error subtractive routing on fanout expander device!!!\n")); /* discovery error */ onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* (2.1.3) discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } } else { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: SA_SAS_DEV_TYPE_EDGE_EXPANDER\n")); if ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE) { /* Setup sasIdentify for the attached device */ sasIdentify.phyIdentifier = pDiscoverResp->phyIdentifier; sasIdentify.deviceType_addressFrameType = pDiscoverResp->attachedDeviceTypeReason & 0x70; 
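/* Note (assumption from the SAS-2 DISCOVER response layout): the 0x70 mask keeps bits 6:4 of attachedDeviceTypeReason, which encode the attached device type (0 = no device, 1 = end device, 2 = edge expander, 3 = fanout expander, matching the debug strings above), and drops the low-order ATTACHED REASON bits. */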
sasIdentify.initiator_ssp_stp_smp = pDiscoverResp->attached_Ssp_Stp_Smp_Sata_Initiator; sasIdentify.target_ssp_stp_smp = pDiscoverResp->attached_SataPS_Ssp_Stp_Smp_Sata_Target; *(bit32*)sasIdentify.sasAddressHi = *(bit32*)pDiscoverResp->attachedSasAddressHi; *(bit32*)sasIdentify.sasAddressLo = *(bit32*)pDiscoverResp->attachedSasAddressLo; /* incremental discovery */ dmSASSubID.sasAddressHi = SA_IDFRM_GET_SAS_ADDRESSHI(&sasIdentify); dmSASSubID.sasAddressLo = SA_IDFRM_GET_SAS_ADDRESSLO(&sasIdentify); dmSASSubID.initiator_ssp_stp_smp = sasIdentify.initiator_ssp_stp_smp; dmSASSubID.target_ssp_stp_smp = sasIdentify.target_ssp_stp_smp; attachedSasHi = SAS2_DISCRSP_GET_ATTACHED_SAS_ADDRESSHI(pDiscoverResp); attachedSasLo = SAS2_DISCRSP_GET_ATTACHED_SAS_ADDRESSLO(pDiscoverResp); /* If the phy has subtractive routing attribute */ if ( SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE) { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: SA_SAS_ROUTING_SUBTRACTIVE\n")); /* Setup upstream phys */ dmExpanderUpStreamPhyAdd(dmRoot, oneExpander, (bit8) pDiscoverResp->attachedPhyIdentifier); /* If the expander already has an upstream device set up */ if (oneExpander->hasUpStreamDevice == agTRUE) { /* just to update MCN */ dmPortSASDeviceFind(dmRoot, onePortContext, attachedSasLo, attachedSasHi, oneDeviceData); /* If the sas address doesn't match */ if ( ((oneExpander->upStreamSASAddressHi != attachedSasHi) || (oneExpander->upStreamSASAddressLo != attachedSasLo)) && (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE || SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) ) { /* TODO: discovery error, callback */ DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: **** Topology Error subtractive routing error - inconsistent SAS address!!!\n")); /* call back to notify discovery error */ onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } else { /* Setup SAS address for up stream device */ oneExpander->hasUpStreamDevice = agTRUE; oneExpander->upStreamSASAddressHi = attachedSasHi; oneExpander->upStreamSASAddressLo = attachedSasLo; if ( (onePortContext->sasLocalAddressHi != attachedSasHi) || (onePortContext->sasLocalAddressLo != attachedSasLo) ) { /* Find the device from the discovered list */ AttachedDevice = dmPortSASDeviceFind(dmRoot, onePortContext, attachedSasLo, attachedSasHi, oneDeviceData); /* If the device has been discovered before */ if ( AttachedDevice != agNULL) { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: Seen This Device Before\n")); /* If attached device is an edge expander */ if ( AttachedDevice->SASSpecDeviceType == SAS_EDGE_EXPANDER_DEVICE) { /* The attached device is an expander */ AttachedExpander = AttachedDevice->dmExpander; /* If the two expanders are the root of the two edge expander sets */ if ( (AttachedExpander->upStreamSASAddressHi ==
DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo)) && (AttachedExpander->upStreamSASAddressLo == DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo)) ) { /* Setup upstream expander for the pExpander */ oneExpander->dmUpStreamExpander = AttachedExpander; } /* If the two expanders are not the root of the two edge expander sets */ else { /* TODO: loop found, discovery error, callback */ DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: **** Topology Error loop detection!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmUpStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* If attached device is not an edge expander */ else { /*TODO: should not happen, ASSERT */ DM_DBG1(("dmUpStreamDiscover2ExpanderPhy, *** Attached Device is not Edge. Confused!!!\n")); } } /* If the device has not been discovered before */ else { /* Add the device */ DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: New device\n")); /* read minimum rate from the configuration onePortContext->LinkRate is SPC's local link rate */ connectionRate = MIN(onePortContext->LinkRate, SAS2_DISCRSP_GET_LOGICAL_LINKRATE(pDiscoverResp)); DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: link rate 0x%x\n", onePortContext->LinkRate)); DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: negotiatedPhyLinkRate 0x%x\n", SAS2_DISCRSP_GET_LINKRATE(pDiscoverResp))); DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: connectionRate 0x%x\n", connectionRate)); //hhhhhhhh if (SAS2_DISCRSP_IS_STP_TARGET(pDiscoverResp) || SAS2_DISCRSP_IS_SATA_DEVICE(pDiscoverResp)) { /* incremental discovery */ if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { /* incremental discovery */ AttachedDevice = dmFindRegNValid( dmRoot, onePortContext, &dmSASSubID ); /* not registered and not valid; add this*/ if (AttachedDevice == agNULL) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } } else { /* incremental discovery */ if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { /* incremental discovery */ AttachedDevice = dmFindRegNValid( dmRoot, onePortContext, &dmSASSubID ); /* not registered and not valid; add this*/ if (AttachedDevice == agNULL) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } } /* 
If the device is added successfully */ if ( AttachedDevice != agNULL) { /* (3.1.2.3.2.3.2.1) callback about new device */ if ( SAS2_DISCRSP_IS_SSP_TARGET(pDiscoverResp) || SAS2_DISCRSP_IS_SSP_INITIATOR(pDiscoverResp) || SAS2_DISCRSP_IS_SMP_INITIATOR(pDiscoverResp) ) { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: Found SSP/SMP SAS %08x-%08x\n", attachedSasHi, attachedSasLo)); } else { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: Found a SAS STP device.\n")); } /* If the attached device is an expander */ if ( (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE) || (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) ) { /* Allocate an expander data structure */ AttachedExpander = dmDiscoveringExpanderAlloc( dmRoot, onePortContext, AttachedDevice ); DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: Found expander=%p\n", AttachedExpander)); /* If allocate successfully */ if ( AttachedExpander != agNULL) { /* Add the pAttachedExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, AttachedExpander); /* Setup upstream expander for the pExpander */ oneExpander->dmUpStreamExpander = AttachedExpander; } /* If failed to allocate */ else { DM_DBG1(("dmUpStreamDiscover2ExpanderPhy, Failed to allocate expander data structure!!!\n")); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* If the attached device is an end device */ else { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: Found end device\n")); /* LP2006-05-26 added upstream device to the newly found device */ AttachedDevice->dmExpander = oneExpander; oneExpander->dmUpStreamExpander = agNULL; } } else { DM_DBG1(("dmUpStreamDiscover2ExpanderPhy, Failed to add a device!!!\n")); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } } } } /* subtractive routing */ } } oneExpander->discoveringPhyId ++; if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { if ( oneExpander->discoveringPhyId < oneDeviceData->numOfPhys ) { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: DISCOVERY_UP_STREAM find more ...\n")); /* continue discovery for the next phy */ dmDiscoverSend(dmRoot, oneDeviceData); } else { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: DISCOVERY_UP_STREAM last phy continue upstream..\n")); /* for MCN */ dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData); /* remove the expander from the discovering list */ dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander); /* continue upstream discovering */ dmUpStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } } else { DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: onePortContext->discovery.status not in DISCOVERY_UP_STREAM; status %d\n", onePortContext->discovery.status)); } DM_DBG2(("dmUpStreamDiscover2ExpanderPhy: end return phyID#%d\n", oneExpander->discoveringPhyId - 1)); return; } osGLOBAL void dmDownStreamDiscoverExpanderPhy( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander, smpRespDiscover_t *pDiscoverResp ) { agsaSASIdentify_t sasIdentify; dmSASSubID_t dmSASSubID; bit32 attachedSasHi, attachedSasLo; dmExpander_t *AttachedExpander; dmExpander_t *UpStreamExpander; dmExpander_t *ConfigurableExpander = agNULL; bit8 connectionRate, negotiatedPhyLinkRate; bit32 configSASAddressHi; bit32 configSASAddressLo; bit32 dupConfigSASAddr = agFALSE; dmDeviceData_t *oneDeviceData; dmDeviceData_t *AttachedDevice = agNULL; bit32 SAS2SAS11Check = agFALSE; dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared =
(dmIntContext_t *)&dmIntRoot->dmAllShared; DM_DBG3(("dmDownStreamDiscoverExpanderPhy: start\n")); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); DM_ASSERT(dmRoot, "(dmDownStreamDiscoverExpanderPhy) dmRoot NULL"); DM_ASSERT(onePortContext, "(dmDownStreamDiscoverExpanderPhy) pPort NULL"); DM_ASSERT(oneExpander, "(dmDownStreamDiscoverExpanderPhy) pExpander NULL"); DM_ASSERT(pDiscoverResp, "(dmDownStreamDiscoverExpanderPhy) pDiscoverResp NULL"); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: onePortContxt=%p oneExpander=%p\n", onePortContext, oneExpander)); if (dmDiscoverCheck(dmRoot, onePortContext) == agTRUE) { DM_DBG1(("dmDownStreamDiscoverExpanderPhy: invalid port or aborted discovery!!!\n")); return; } if (oneExpander != oneExpander->dmDevice->dmExpander) { DM_DBG1(("dmDownStreamDiscoverExpanderPhy: wrong!!!\n")); } /* (1) Find the device structure of the expander */ oneDeviceData = oneExpander->dmDevice; DM_ASSERT(oneDeviceData, "(dmDownStreamDiscoverExpanderPhy) pDevice NULL"); /* for debugging */ DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Phy #%d of SAS %08x-%08x\n", oneExpander->discoveringPhyId, oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3((" Attached device: %s\n", ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 0 ? "No Device" : (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 1 ? "End Device" : (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 2 ? "Edge Expander" : "Fanout Expander"))))); /* for debugging */ if (oneExpander->discoveringPhyId != pDiscoverResp->phyIdentifier) { DM_DBG1(("dmDownStreamDiscoverExpanderPhy: !!! 
Incorrect SMP response !!!\n")); DM_DBG1(("dmDownStreamDiscoverExpanderPhy: Request PhyID #%d Response PhyID #%d !!!\n", oneExpander->discoveringPhyId, pDiscoverResp->phyIdentifier)); dmhexdump("NO_DEVICE", (bit8*)pDiscoverResp, sizeof(smpRespDiscover_t)); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } if ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE) { DM_DBG3((" SAS address : %08x-%08x\n", DISCRSP_GET_ATTACHED_SAS_ADDRESSHI(pDiscoverResp), DISCRSP_GET_ATTACHED_SAS_ADDRESSLO(pDiscoverResp))); DM_DBG3((" SSP Target : %d\n", DISCRSP_IS_SSP_TARGET(pDiscoverResp)?1:0)); DM_DBG3((" STP Target : %d\n", DISCRSP_IS_STP_TARGET(pDiscoverResp)?1:0)); DM_DBG3((" SMP Target : %d\n", DISCRSP_IS_SMP_TARGET(pDiscoverResp)?1:0)); DM_DBG3((" SATA DEVICE : %d\n", DISCRSP_IS_SATA_DEVICE(pDiscoverResp)?1:0)); DM_DBG3((" SSP Initiator : %d\n", DISCRSP_IS_SSP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG3((" STP Initiator : %d\n", DISCRSP_IS_STP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG3((" SMP Initiator : %d\n", DISCRSP_IS_SMP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG3((" Phy ID : %d\n", pDiscoverResp->phyIdentifier)); DM_DBG3((" Attached Phy ID: %d\n", pDiscoverResp->attachedPhyIdentifier)); } /* end for debugging */ /* saving routing attribute for non self-configuring expanders */ oneExpander->routingAttribute[pDiscoverResp->phyIdentifier] = DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp); oneExpander->discoverSMPAllowed = agTRUE; /* If a device is attached */ if ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE) { /* Setup sasIdentify for the attached device */ sasIdentify.phyIdentifier = pDiscoverResp->phyIdentifier; sasIdentify.deviceType_addressFrameType = pDiscoverResp->attachedDeviceType & 0x70; sasIdentify.initiator_ssp_stp_smp = pDiscoverResp->attached_Ssp_Stp_Smp_Sata_Initiator; sasIdentify.target_ssp_stp_smp = pDiscoverResp->attached_SataPS_Ssp_Stp_Smp_Sata_Target; *(bit32*)sasIdentify.sasAddressHi = *(bit32*)pDiscoverResp->attachedSasAddressHi; *(bit32*)sasIdentify.sasAddressLo = *(bit32*)pDiscoverResp->attachedSasAddressLo; /* incremental discovery */ dmSASSubID.sasAddressHi = SA_IDFRM_GET_SAS_ADDRESSHI(&sasIdentify); dmSASSubID.sasAddressLo = SA_IDFRM_GET_SAS_ADDRESSLO(&sasIdentify); dmSASSubID.initiator_ssp_stp_smp = sasIdentify.initiator_ssp_stp_smp; dmSASSubID.target_ssp_stp_smp = sasIdentify.target_ssp_stp_smp; attachedSasHi = DISCRSP_GET_ATTACHED_SAS_ADDRESSHI(pDiscoverResp); attachedSasLo = DISCRSP_GET_ATTACHED_SAS_ADDRESSLO(pDiscoverResp); /* If it's a direct routing */ if ( DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_DIRECT) { /* If the attached device is an expander */ if ( (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) || (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE) ) { DM_DBG1(("dmDownStreamDiscoverExpanderPhy: **** Topology Error direct routing can't connect to expander!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, 
onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } } /* If the expander's attached device is not myself */ if ( (attachedSasHi != onePortContext->sasLocalAddressHi) || (attachedSasLo != onePortContext->sasLocalAddressLo) ) { /* Find the attached device from discovered list */ AttachedDevice = dmPortSASDeviceFind(dmRoot, onePortContext, attachedSasLo, attachedSasHi, oneDeviceData); /* If the device has not been discovered before */ if ( AttachedDevice == agNULL) //11 { /* If the phy has subtractive routing attribute */ if ( DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE && (DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE || DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) ) { /* TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscoverExpanderPhy: Deferred!!! **** Topology Error subtractive routing error - inconsistent SAS address!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); onePortContext->discovery.DeferredError = agTRUE; } else /* 11 */ { /* Add the device */ /* read minimum rate from the configuration onePortContext->LinkRate is SPC's local link rate */ connectionRate = MIN(onePortContext->LinkRate, DISCRSP_GET_LINKRATE(pDiscoverResp)); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: link rate 0x%x\n", DEVINFO_GET_LINKRATE(&oneDeviceData->agDeviceInfo))); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: negotiatedPhyLinkRate 0x%x\n", DISCRSP_GET_LINKRATE(pDiscoverResp))); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: connectionRate 0x%x\n", connectionRate)); if (DISCRSP_IS_STP_TARGET(pDiscoverResp) || DISCRSP_IS_SATA_DEVICE(pDiscoverResp)) { if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { /* incremental discovery */ AttachedDevice = dmFindRegNValid( dmRoot, onePortContext, &dmSASSubID ); /* not registered and not valid; add this*/ if (AttachedDevice == agNULL) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } } /* DISCRSP_IS_STP_TARGET(pDiscoverResp) || DISCRSP_IS_SATA_DEVICE(pDiscoverResp) */ else /* 22 */ { if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { /* incremental discovery */ AttachedDevice = dmFindRegNValid( dmRoot, onePortContext, &dmSASSubID ); /* not registered and not valid; add this*/ if 
(AttachedDevice == agNULL) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } } /* else 22 */ DM_DBG3(("dmDownStreamDiscoverExpanderPhy: newDevice pDevice=%p\n", AttachedDevice)); /* If the device is added successfully */ if ( AttachedDevice != agNULL) { if ( SA_IDFRM_IS_SSP_TARGET(&sasIdentify) || SA_IDFRM_IS_SMP_TARGET(&sasIdentify) || SA_IDFRM_IS_SSP_INITIATOR(&sasIdentify) || SA_IDFRM_IS_SMP_INITIATOR(&sasIdentify) ) { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Report a new SAS device !!\n")); } else { if ( SA_IDFRM_IS_STP_TARGET(&sasIdentify) || SA_IDFRM_IS_SATA_DEVICE(&sasIdentify) ) { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Found an STP or SATA device.\n")); } else { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Found Other type of device.\n")); } } /* LP2006-05-26 added upstream device to the newly found device */ AttachedDevice->dmExpander = oneExpander; DM_DBG3(("dmDownStreamDiscoverExpanderPhy: AttachedDevice %p did %d\n", AttachedDevice, AttachedDevice->id)); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Attached oneExpander %p did %d\n", AttachedDevice->dmExpander, AttachedDevice->dmExpander->id)); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: oneDeviceData %p did %d\n", oneDeviceData, oneDeviceData->id)); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: oneExpander %p did %d\n", oneDeviceData->dmExpander, oneDeviceData->dmExpander->id)); /* If the phy has table routing attribute */ if ( DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_TABLE) { /* If the attached device is a fan out expander */ if ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) { /* TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscoverExpanderPhy: **** Topology Error two table routing phys are connected!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } else if ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE) { /* Allocate an expander data structure */ AttachedExpander = dmDiscoveringExpanderAlloc(dmRoot, onePortContext, AttachedDevice); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Found a EDGE exp device.%p\n", AttachedExpander)); /* If allocate successfully */ if ( AttachedExpander != agNULL) { /* set up downstream information on configurable expander */ dmExpanderDownStreamPhyAdd(dmRoot, oneExpander, (bit8) oneExpander->discoveringPhyId); /* Setup upstream information */ dmExpanderUpStreamPhyAdd(dmRoot, AttachedExpander, (bit8) oneExpander->discoveringPhyId); AttachedExpander->hasUpStreamDevice = agTRUE; AttachedExpander->upStreamSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); AttachedExpander->upStreamSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); 
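              /* Record the upstream SAS address on the newly found edge expander;
                 the subtractive-routing checks later in this function compare
                 against these fields to detect loops and double-subtractive
                 attachments. */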
AttachedExpander->dmUpStreamExpander = oneExpander; /* (2.3.2.2.2.2.2.2.2) Add the pAttachedExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, AttachedExpander); } /* If failed to allocate */ else { DM_DBG1(("dmDownStreamDiscoverExpanderPhy: Failed to allocate expander data structure!!!\n")); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } } /* DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_TABLE */ /* If status is still DISCOVERY_DOWN_STREAM */ if ( onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 1st before\n")); dmDumpAllUpExp(dmRoot, onePortContext, oneExpander); UpStreamExpander = oneExpander->dmUpStreamExpander; ConfigurableExpander = dmFindConfigurableExp(dmRoot, onePortContext, oneExpander); configSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo); configSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo); if (ConfigurableExpander) { if ( (ConfigurableExpander->dmDevice->SASAddressID.sasAddressHi == DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo)) && (ConfigurableExpander->dmDevice->SASAddressID.sasAddressLo == DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo)) ) { /* directly attached between oneExpander and ConfigurableExpander */ DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 1st before loc 1\n")); configSASAddressHi = oneExpander->dmDevice->SASAddressID.sasAddressHi; configSASAddressLo = oneExpander->dmDevice->SASAddressID.sasAddressLo; } else { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 1st before loc 2\n")); configSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo); configSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo); } } /* if !ConfigurableExpander */ dupConfigSASAddr = dmDuplicateConfigSASAddr(dmRoot, ConfigurableExpander, configSASAddressHi, configSASAddressLo ); if ( ConfigurableExpander && dupConfigSASAddr == agFALSE) { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 1st q123\n")); UpStreamExpander->dmCurrentDownStreamExpander = oneExpander; ConfigurableExpander->currentDownStreamPhyIndex = dmFindCurrentDownStreamPhyIndex(dmRoot, ConfigurableExpander); ConfigurableExpander->dmReturnginExpander = oneExpander; dmRoutingEntryAdd(dmRoot, ConfigurableExpander, ConfigurableExpander->downStreamPhys[ConfigurableExpander->currentDownStreamPhyIndex], configSASAddressHi, configSASAddressLo ); } } /* onePortContext->discovery.status == DISCOVERY_DOWN_STREAM */ } /* AttachedDevice != agNULL */ /* If fail to add the device */ else { DM_DBG1(("dmDownStreamDiscoverExpanderPhy: Failed to add a device!!!\n")); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* else 11 */ } /* AttachedDevice == agNULL */ /* If the device has been discovered before */ else /* haha discovered before 33 */ { /* If the phy has subtractive routing attribute */ if ( DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE) { /* If the expander doesn't have up stream device */ if ( oneExpander->hasUpStreamDevice == agFALSE) { /* TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscoverExpanderPhy: **** Topology Error loop, or end device connects to two expanders!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); 
onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } /* If the expander has up stream device */ else /* 44 */ { /* If sas address doesn't match */ if ( (oneExpander->upStreamSASAddressHi != attachedSasHi) || (oneExpander->upStreamSASAddressLo != attachedSasLo) ) { /* TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscoverExpanderPhy: **** Topology Error two subtractive phys!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* else 44 */ } /* DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE */ /* If the phy has table routing attribute */ else if ( DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_TABLE) { /* If the attached device is a fan out expander */ if ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) { /* (2.3.3.2.1.1) TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscoverExpanderPhy: **** Topology Error fan out expander to routing table phy!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } /* If the attached device is an edge expander */ else if ( DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE) { /* Setup up stream inform */ AttachedExpander = AttachedDevice->dmExpander; DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Found edge expander=%p\n", AttachedExpander)); /* If the attached expander has up stream device */ if ( AttachedExpander->hasUpStreamDevice == agTRUE) { /* compare the sas address */ if ( (AttachedExpander->upStreamSASAddressHi != DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo)) || (AttachedExpander->upStreamSASAddressLo != DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo))) { /* TODO: discovery error, callback */ SAS2SAS11Check = dmSAS2SAS11ErrorCheck(dmRoot, onePortContext, AttachedExpander, oneExpander, 
oneExpander); if (SAS2SAS11Check == agTRUE) { DM_DBG1(("dmDownStreamDiscoverExpanderPhy: **** Topology Error SAS2 and SAS1.1!!!\n")); } else { DM_DBG1(("dmDownStreamDiscoverExpanderPhy: **** Topology Error two table routing phys connected (1)!!!\n")); } onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } else { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Add edge expander=%p\n", AttachedExpander)); /* set up downstream information on configurable expander */ dmExpanderDownStreamPhyAdd(dmRoot, oneExpander, (bit8) oneExpander->discoveringPhyId); /* haha */ dmExpanderUpStreamPhyAdd(dmRoot, AttachedExpander, (bit8) oneExpander->discoveringPhyId); /* Add the pAttachedExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, AttachedExpander); } } /* AttachedExpander->hasUpStreamDevice == agTRUE */ /* If the attached expander doesn't have up stream device */ else { /* TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscoverExpanderPhy: **** Topology Error two table routing phys connected (2)!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscoverExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE */ } /* DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_TABLE */ /* do this regradless of sub or table */ /* If status is still DISCOVERY_DOWN_STREAM */ if ( onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 2nd before\n")); dmDumpAllUpExp(dmRoot, onePortContext, oneExpander); UpStreamExpander = oneExpander->dmUpStreamExpander; ConfigurableExpander = dmFindConfigurableExp(dmRoot, onePortContext, oneExpander); configSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo); configSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo); if (ConfigurableExpander) { if ( (ConfigurableExpander->dmDevice->SASAddressID.sasAddressHi == DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo)) && (ConfigurableExpander->dmDevice->SASAddressID.sasAddressLo == DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo)) ) { /* directly attached between oneExpander and ConfigurableExpander */ DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 2nd before loc 
1\n")); configSASAddressHi = oneExpander->dmDevice->SASAddressID.sasAddressHi; configSASAddressLo = oneExpander->dmDevice->SASAddressID.sasAddressLo; } else { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 2nd before loc 2\n")); configSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo); configSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo); } } /* if !ConfigurableExpander */ dupConfigSASAddr = dmDuplicateConfigSASAddr(dmRoot, ConfigurableExpander, configSASAddressHi, configSASAddressLo ); if ( ConfigurableExpander && dupConfigSASAddr == agFALSE) { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 2nd q123 \n")); UpStreamExpander->dmCurrentDownStreamExpander = oneExpander; ConfigurableExpander->currentDownStreamPhyIndex = dmFindCurrentDownStreamPhyIndex(dmRoot, ConfigurableExpander); ConfigurableExpander->dmReturnginExpander = oneExpander; dmRoutingEntryAdd(dmRoot, ConfigurableExpander, ConfigurableExpander->downStreamPhys[ConfigurableExpander->currentDownStreamPhyIndex], configSASAddressHi, configSASAddressLo ); } } /* onePortContext->discovery.status == DISCOVERY_DOWN_STREAM */ /* incremental discovery */ if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_INCREMENTAL_START) { connectionRate = MIN(onePortContext->LinkRate, DISCRSP_GET_LINKRATE(pDiscoverResp)); if (DISCRSP_IS_STP_TARGET(pDiscoverResp) || DISCRSP_IS_SATA_DEVICE(pDiscoverResp)) { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: incremental SATA_STP\n")); dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: incremental SAS\n")); dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } /* onePortContext->discovery.type == DM_DISCOVERY_OPTION_INCREMENTAL_START */ } /* else 33 */ } /* (attachedSasLo != onePortContext->sasLocalAddressLo) */ else /* else 44 */ { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: Found Self\n")); DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 3rd before\n")); dmDumpAllUpExp(dmRoot, onePortContext, oneExpander); UpStreamExpander = oneExpander->dmUpStreamExpander; ConfigurableExpander = dmFindConfigurableExp(dmRoot, onePortContext, oneExpander); dupConfigSASAddr = dmDuplicateConfigSASAddr(dmRoot, ConfigurableExpander, onePortContext->sasLocalAddressHi, onePortContext->sasLocalAddressLo ); if ( ConfigurableExpander && dupConfigSASAddr == agFALSE) { DM_DBG3(("dmDownStreamDiscoverExpanderPhy: 3rd q123 Setup routing table\n")); UpStreamExpander->dmCurrentDownStreamExpander = oneExpander; ConfigurableExpander->currentDownStreamPhyIndex = dmFindCurrentDownStreamPhyIndex(dmRoot, ConfigurableExpander); ConfigurableExpander->dmReturnginExpander = oneExpander; dmRoutingEntryAdd(dmRoot, ConfigurableExpander, ConfigurableExpander->downStreamPhys[ConfigurableExpander->currentDownStreamPhyIndex], onePortContext->sasLocalAddressHi, onePortContext->sasLocalAddressLo ); } } /* else 44 */ } /* DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE */ /* If no device is attached */ else { DM_DBG2(("!!!!!!!!!!!!!!!!!!!!! 
SPIN SATA !!!!!!!!!!!!!!!!!!!!!!!!!!!\n"));
    negotiatedPhyLinkRate = DISCRSP_GET_LINKRATE(pDiscoverResp); // added by thenil
    /* negotiated link rate 0x03 (SPINUP_HOLD): hard reset the phy to spin up the SATA device */
    if (negotiatedPhyLinkRate == 0x03)
    {
      DM_DBG1(("dmDownStreamDiscoverExpanderPhy: SPIN SATA sent reset\n"));
      dmPhyControlSend(dmRoot,
                       oneDeviceData,
                       SMP_PHY_CONTROL_HARD_RESET,
                       pDiscoverResp->phyIdentifier
                      );
    }
    /* do nothing */
  }
  /* Increment the discovering phy id */
  oneExpander->discoveringPhyId++;
  /* If the discovery status is DISCOVERY_DOWN_STREAM */
  if ( onePortContext->discovery.status == DISCOVERY_DOWN_STREAM )
  {
    /* If not the last phy */
    if ( oneExpander->discoveringPhyId < oneDeviceData->numOfPhys )
    {
      DM_DBG3(("dmDownStreamDiscoverExpanderPhy: More Phys to discover\n"));
      /* continue discovery for the next phy */
      dmDiscoverSend(dmRoot, oneDeviceData);
    }
    /* If the last phy */
    else
    {
      DM_DBG3(("dmDownStreamDiscoverExpanderPhy: No More Phys\n"));
      /* for MCN */
      dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData);
      /* remove the expander from the discovering list */
      dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander);
      /* continue downstream discovering */
      dmDownStreamDiscovering(dmRoot, onePortContext, oneDeviceData);
    }
  }
  else
  {
    DM_DBG3(("dmDownStreamDiscoverExpanderPhy: onePortContext->discovery.status not in DISCOVERY_DOWN_STREAM; status %d\n", onePortContext->discovery.status));
  }
  DM_DBG3(("dmDownStreamDiscoverExpanderPhy: end return phyID#%d\n", oneExpander->discoveringPhyId - 1));
  return;
}

/* works at a SAS2 expander:
   if currentExpander is SAS2, called in dmDownStreamDiscover2ExpanderPhy();
   if currentExpander is SAS1.1, called in dmDownStreamDiscoverExpanderPhy()
*/
osGLOBAL bit32
dmSAS2SAS11ErrorCheck(
  dmRoot_t              *dmRoot,
  dmIntPortContext_t    *onePortContext,
  dmExpander_t          *topExpander,
  dmExpander_t          *bottomExpander,
  dmExpander_t          *currentExpander
  )
{
  bit32  result = agFALSE, i = 0;
  bit8   downStreamPhyID, upStreamPhyID;

  DM_DBG2(("dmSAS2SAS11ErrorCheck: start\n"));
  if (topExpander == agNULL)
  {
    DM_DBG2(("dmSAS2SAS11ErrorCheck: topExpander is NULL\n"));
    return result;
  }
  if (bottomExpander == agNULL)
  {
    DM_DBG2(("dmSAS2SAS11ErrorCheck: bottomExpander is NULL\n"));
    return result;
  }
  if (currentExpander == agNULL)
  {
    DM_DBG2(("dmSAS2SAS11ErrorCheck: currentExpander is NULL\n"));
    return result;
  }
  DM_DBG2(("dmSAS2SAS11ErrorCheck: topExpander addrHi 0x%08x addrLo 0x%08x\n", topExpander->dmDevice->SASAddressID.sasAddressHi, topExpander->dmDevice->SASAddressID.sasAddressLo));
  DM_DBG2(("dmSAS2SAS11ErrorCheck: bottomExpander addrHi 0x%08x addrLo 0x%08x\n", bottomExpander->dmDevice->SASAddressID.sasAddressHi, bottomExpander->dmDevice->SASAddressID.sasAddressLo));
  DM_DBG2(("dmSAS2SAS11ErrorCheck: currentExpander addrHi 0x%08x addrLo 0x%08x\n", currentExpander->dmDevice->SASAddressID.sasAddressHi, currentExpander->dmDevice->SASAddressID.sasAddressLo));
  for (i=0;i<DM_MAX_EXPANDER_PHYS;i++)
  {
    downStreamPhyID = topExpander->downStreamPhys[i];
    upStreamPhyID = bottomExpander->upStreamPhys[i];
    if (currentExpander->SAS2 == 1)
    {
      /* a SAS1.1 expander's table phy attached to a SAS2 expander's subtractive phy */
      if ( downStreamPhyID == upStreamPhyID &&
           topExpander->routingAttribute[downStreamPhyID] == SAS_ROUTING_TABLE &&
           bottomExpander->routingAttribute[i] == SAS_ROUTING_SUBTRACTIVE &&
           topExpander->SAS2 == 0 &&
           bottomExpander->SAS2 == 1
         )
      {
        result = agTRUE;
        break;
      }
    }
    else if (currentExpander->SAS2 == 0)
    {
      /* a SAS2 expander's subtractive phy attached to a SAS1.1 expander's table phy */
      if ( downStreamPhyID == upStreamPhyID &&
           topExpander->routingAttribute[downStreamPhyID] == SAS_ROUTING_SUBTRACTIVE &&
           bottomExpander->routingAttribute[i] == SAS_ROUTING_TABLE &&
           topExpander->SAS2 == 1 &&
           bottomExpander->SAS2 == 0
         )
      {
        result = agTRUE;
        break;
      }
    }
  }
  return
result; } osGLOBAL void dmDownStreamDiscover2ExpanderPhy( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander, smpRespDiscover2_t *pDiscoverResp ) { dmDeviceData_t *oneDeviceData; dmExpander_t *UpStreamExpander; dmDeviceData_t *AttachedDevice = agNULL; dmExpander_t *AttachedExpander; agsaSASIdentify_t sasIdentify; bit8 connectionRate; bit32 attachedSasHi, attachedSasLo; dmSASSubID_t dmSASSubID; dmExpander_t *ConfigurableExpander = agNULL; bit32 dupConfigSASAddr = agFALSE; bit32 configSASAddressHi; bit32 configSASAddressLo; bit32 SAS2SAS11Check = agFALSE; dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: start\n")); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); DM_ASSERT(dmRoot, "(dmDownStreamDiscover2ExpanderPhy) dmRoot NULL"); DM_ASSERT(onePortContext, "(dmDownStreamDiscover2ExpanderPhy) pPort NULL"); DM_ASSERT(oneExpander, "(dmDownStreamDiscover2ExpanderPhy) pExpander NULL"); DM_ASSERT(pDiscoverResp, "(dmDownStreamDiscover2ExpanderPhy) pDiscoverResp NULL"); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: onePortContxt=%p oneExpander=%p oneDeviceData=%p\n", onePortContext, oneExpander, oneExpander->dmDevice)); if (dmDiscoverCheck(dmRoot, onePortContext) == agTRUE) { DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: invalid port or aborted discovery!!!\n")); return; } if (oneExpander != oneExpander->dmDevice->dmExpander) { DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: wrong!!!\n")); } /* (1) Find the device structure of the expander */ oneDeviceData = oneExpander->dmDevice; DM_ASSERT(oneDeviceData, "(dmDownStreamDiscover2ExpanderPhy) pDevice NULL"); /* for debugging */ DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Phy #%d of SAS %08x-%08x\n", oneExpander->discoveringPhyId, oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG2((" Attached device: %s\n", ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 0 ? "No Device" : (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 1 ? "End Device" : (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == 2 ? "Edge Expander" : "Fanout Expander"))))); /* for debugging */ if (oneExpander->discoveringPhyId != pDiscoverResp->phyIdentifier) { DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: !!! 
Incorrect SMP response !!!\n")); DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: Request PhyID #%d Response PhyID #%d\n", oneExpander->discoveringPhyId, pDiscoverResp->phyIdentifier)); dmhexdump("NO_DEVICE", (bit8*)pDiscoverResp, sizeof(smpRespDiscover2_t)); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } if ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE) { DM_DBG2((" SAS address : %08x-%08x\n", SAS2_DISCRSP_GET_ATTACHED_SAS_ADDRESSHI(pDiscoverResp), SAS2_DISCRSP_GET_ATTACHED_SAS_ADDRESSLO(pDiscoverResp))); DM_DBG2((" SSP Target : %d\n", SAS2_DISCRSP_IS_SSP_TARGET(pDiscoverResp)?1:0)); DM_DBG2((" STP Target : %d\n", SAS2_DISCRSP_IS_STP_TARGET(pDiscoverResp)?1:0)); DM_DBG2((" SMP Target : %d\n", SAS2_DISCRSP_IS_SMP_TARGET(pDiscoverResp)?1:0)); DM_DBG2((" SATA DEVICE : %d\n", SAS2_DISCRSP_IS_SATA_DEVICE(pDiscoverResp)?1:0)); DM_DBG2((" SSP Initiator : %d\n", SAS2_DISCRSP_IS_SSP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG2((" STP Initiator : %d\n", SAS2_DISCRSP_IS_STP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG2((" SMP Initiator : %d\n", SAS2_DISCRSP_IS_SMP_INITIATOR(pDiscoverResp)?1:0)); DM_DBG2((" Phy ID : %d\n", pDiscoverResp->phyIdentifier)); DM_DBG2((" Attached Phy ID: %d\n", pDiscoverResp->attachedPhyIdentifier)); } /* saving routing attribute for non self-configuring expanders */ oneExpander->routingAttribute[pDiscoverResp->phyIdentifier] = SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp); oneExpander->discoverSMPAllowed = agTRUE; /* If a device is attached */ if ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) != SAS_NO_DEVICE) { /* Setup sasIdentify for the attached device */ sasIdentify.phyIdentifier = pDiscoverResp->phyIdentifier; sasIdentify.deviceType_addressFrameType = pDiscoverResp->attachedDeviceTypeReason & 0x70; sasIdentify.initiator_ssp_stp_smp = pDiscoverResp->attached_Ssp_Stp_Smp_Sata_Initiator; sasIdentify.target_ssp_stp_smp = pDiscoverResp->attached_SataPS_Ssp_Stp_Smp_Sata_Target; *(bit32*)sasIdentify.sasAddressHi = *(bit32*)pDiscoverResp->attachedSasAddressHi; *(bit32*)sasIdentify.sasAddressLo = *(bit32*)pDiscoverResp->attachedSasAddressLo; /* incremental discovery */ dmSASSubID.sasAddressHi = SA_IDFRM_GET_SAS_ADDRESSHI(&sasIdentify); dmSASSubID.sasAddressLo = SA_IDFRM_GET_SAS_ADDRESSLO(&sasIdentify); dmSASSubID.initiator_ssp_stp_smp = sasIdentify.initiator_ssp_stp_smp; dmSASSubID.target_ssp_stp_smp = sasIdentify.target_ssp_stp_smp; attachedSasHi = SAS2_DISCRSP_GET_ATTACHED_SAS_ADDRESSHI(pDiscoverResp); attachedSasLo = SAS2_DISCRSP_GET_ATTACHED_SAS_ADDRESSLO(pDiscoverResp); /* If it's a direct routing */ if ( SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_DIRECT) { /* If the attached device is an expander */ if ( (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) || (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE) ) { DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** Topology Error direct routing can't connect to expander!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, 
onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } } /* If the expander's attached device is not myself */ if ( (attachedSasHi != onePortContext->sasLocalAddressHi) || (attachedSasLo != onePortContext->sasLocalAddressLo) ) { /* Find the attached device from discovered list */ AttachedDevice = dmPortSASDeviceFind(dmRoot, onePortContext, attachedSasLo, attachedSasHi, oneDeviceData); /* If the device has not been discovered before */ if ( AttachedDevice == agNULL) //11 { //qqqqqq if (0) { DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** Topology Error subtractive routing error - inconsistent SAS address!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } else { /* Add the device */ /* read minimum rate from the configuration onePortContext->LinkRate is SPC's local link rate */ connectionRate = MIN(onePortContext->LinkRate, SAS2_DISCRSP_GET_LOGICAL_LINKRATE(pDiscoverResp)); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: link rate 0x%x\n", DEVINFO_GET_LINKRATE(&oneDeviceData->agDeviceInfo))); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: negotiatedPhyLinkRate 0x%x\n", SAS2_DISCRSP_GET_LINKRATE(pDiscoverResp))); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: connectionRate 0x%x\n", connectionRate)); if (SAS2_DISCRSP_IS_STP_TARGET(pDiscoverResp) || SAS2_DISCRSP_IS_SATA_DEVICE(pDiscoverResp)) { if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { /* incremental discovery */ AttachedDevice = dmFindRegNValid( dmRoot, onePortContext, &dmSASSubID ); /* not registered and not valid; add this*/ if (AttachedDevice == agNULL) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } } else { if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { /* incremental discovery */ AttachedDevice = dmFindRegNValid( dmRoot, onePortContext, &dmSASSubID ); /* not registered and not valid; add this*/ if (AttachedDevice == agNULL) { AttachedDevice = dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } } DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: newDevice 
pDevice=%p\n", AttachedDevice)); /* If the device is added successfully */ if ( AttachedDevice != agNULL) { if ( SA_IDFRM_IS_SSP_TARGET(&sasIdentify) || SA_IDFRM_IS_SMP_TARGET(&sasIdentify) || SA_IDFRM_IS_SSP_INITIATOR(&sasIdentify) || SA_IDFRM_IS_SMP_INITIATOR(&sasIdentify) ) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Report a new SAS device !!\n")); } else { if ( SA_IDFRM_IS_STP_TARGET(&sasIdentify) || SA_IDFRM_IS_SATA_DEVICE(&sasIdentify) ) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Found an STP or SATA device.\n")); } else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Found Other type of device.\n")); } } /* LP2006-05-26 added upstream device to the newly found device */ AttachedDevice->dmExpander = oneExpander; DM_DBG3(("dmDownStreamDiscover2ExpanderPhy: AttachedDevice %p did %d\n", AttachedDevice, AttachedDevice->id)); DM_DBG3(("dmDownStreamDiscover2ExpanderPhy: Attached oneExpander %p did %d\n", AttachedDevice->dmExpander, AttachedDevice->dmExpander->id)); DM_DBG3(("dmDownStreamDiscover2ExpanderPhy: oneDeviceData %p did %d\n", oneDeviceData, oneDeviceData->id)); DM_DBG3(("dmDownStreamDiscover2ExpanderPhy: oneExpander %p did %d\n", oneDeviceData->dmExpander, oneDeviceData->dmExpander->id)); /* If the phy has table routing attribute */ if ( SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_TABLE) { /* If the attached device is a fan out expander */ if ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE) { /* TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** Topology Error two table routing phys are connected!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } else if ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE) { /* Allocate an expander data structure */ AttachedExpander = dmDiscoveringExpanderAlloc(dmRoot, onePortContext, AttachedDevice); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Found a EDGE exp device.%p\n", AttachedExpander)); /* If allocate successfully */ if ( AttachedExpander != agNULL) { /* set up downstream information on configurable expander */ dmExpanderDownStreamPhyAdd(dmRoot, oneExpander, (bit8) oneExpander->discoveringPhyId); /* Setup upstream information */ dmExpanderUpStreamPhyAdd(dmRoot, AttachedExpander, (bit8) oneExpander->discoveringPhyId); //qqqqq AttachedExpander->hasUpStreamDevice = agTRUE; AttachedExpander->upStreamSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); AttachedExpander->upStreamSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); AttachedExpander->dmUpStreamExpander = oneExpander; /* (2.3.2.2.2.2.2.2.2) Add the pAttachedExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, AttachedExpander); } /* If failed to allocate */ else { DM_DBG1(("dmDownStreamDiscover2ExpanderPhy, Failed to allocate expander 
data structure!!!\n")); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } } //qqqqq else if ( SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE && (SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE || SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE) ) { /* Allocate an expander data structure */ AttachedExpander = dmDiscoveringExpanderAlloc(dmRoot, onePortContext, AttachedDevice); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Found a EDGE/FANOUT exp device.%p\n", AttachedExpander)); /* If allocate successfully */ if ( AttachedExpander != agNULL) { /* set up downstream information on configurable expander */ dmExpanderDownStreamPhyAdd(dmRoot, oneExpander, (bit8) oneExpander->discoveringPhyId); /* Setup upstream information */ dmExpanderUpStreamPhyAdd(dmRoot, AttachedExpander, (bit8) oneExpander->discoveringPhyId); AttachedExpander->hasUpStreamDevice = agTRUE; AttachedExpander->upStreamSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); AttachedExpander->upStreamSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); AttachedExpander->dmUpStreamExpander = oneExpander; /* (2.3.2.2.2.2.2.2.2) Add the pAttachedExpander to discovering list */ dmDiscoveringExpanderAdd(dmRoot, onePortContext, AttachedExpander); } /* If failed to allocate */ else { DM_DBG1(("dmDownStreamDiscover2ExpanderPhy, Failed to allocate expander data structure (2)!!!\n")); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } /* If status is still DISCOVERY_DOWN_STREAM */ if ( onePortContext->discovery.status == DISCOVERY_DOWN_STREAM && onePortContext->discovery.ConfiguresOthers == agFALSE) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 1st before\n")); dmDumpAllUpExp(dmRoot, onePortContext, oneExpander); UpStreamExpander = oneExpander->dmUpStreamExpander; ConfigurableExpander = dmFindConfigurableExp(dmRoot, onePortContext, oneExpander); configSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo); configSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo); if (ConfigurableExpander) { if ( (ConfigurableExpander->dmDevice->SASAddressID.sasAddressHi == DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo)) && (ConfigurableExpander->dmDevice->SASAddressID.sasAddressLo == DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo)) ) { /* directly attached between oneExpander and ConfigurableExpander */ DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 1st before loc 1\n")); configSASAddressHi = oneExpander->dmDevice->SASAddressID.sasAddressHi; configSASAddressLo = oneExpander->dmDevice->SASAddressID.sasAddressLo; } else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 1st before loc 2\n")); configSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo); configSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo); } } /* if !ConfigurableExpander */ dupConfigSASAddr = dmDuplicateConfigSASAddr(dmRoot, ConfigurableExpander, configSASAddressHi, configSASAddressLo ); if ( ConfigurableExpander && dupConfigSASAddr == agFALSE) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 1st q123\n")); UpStreamExpander->dmCurrentDownStreamExpander = oneExpander; ConfigurableExpander->currentDownStreamPhyIndex = dmFindCurrentDownStreamPhyIndex(dmRoot, ConfigurableExpander); ConfigurableExpander->dmReturnginExpander = oneExpander; dmRoutingEntryAdd(dmRoot, ConfigurableExpander, 
ConfigurableExpander->downStreamPhys[ConfigurableExpander->currentDownStreamPhyIndex], configSASAddressHi, configSASAddressLo ); } } } /* If fail to add the device */ else { DM_DBG1(("dmDownStreamDiscover2ExpanderPhy, Failed to add a device!!!\n")); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } } } /* If the device has been discovered before */ else /* discovered before */ { /* If the phy has subtractive routing attribute */ if ( SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_SUBTRACTIVE) { /* If the expander doesn't have up stream device */ if ( oneExpander->hasUpStreamDevice == agFALSE) { /* TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** Topology Error loop, or end device connects to two expanders!!!\n")); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo); onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId; DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier)); /* discovery done */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } /* If the expander has up stream device */ else { //qqqqq /* If sas address doesn't match */ if ( (oneExpander->upStreamSASAddressHi != attachedSasHi) || (oneExpander->upStreamSASAddressLo != attachedSasLo) ) { /* TODO: discovery error, callback */ DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** two subtractive phys!!! 
Allowed in SAS2!!!\n"));
            onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo);
            onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo);
            onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId;
            onePortContext->discovery.DeferredError = agTRUE;
          }
        }
      }
      /* If the phy has table routing attribute */
      else if ( SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_TABLE)
      {
        /* If the attached device is a fan out expander */
        if ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_FANOUT_EXPANDER_DEVICE)
        {
          /* (2.3.3.2.1.1) TODO: discovery error, callback */
          DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** Topology Error fan out expander to routing table phy!!!\n"));
          onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo);
          onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo);
          onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId;
          DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier));
          /* discovery done */
          dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE);
        }
        /* If the attached device is an edge expander */
        else if ( SAS2_DISCRSP_GET_ATTACHED_DEVTYPE(pDiscoverResp) == SAS_EDGE_EXPANDER_DEVICE)
        {
          /* Set up upstream info */
          AttachedExpander = AttachedDevice->dmExpander;
          DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Found edge expander=%p\n", AttachedExpander));
          //hhhhhh
          /* If the attached expander has up stream device */
          if ( AttachedExpander->hasUpStreamDevice == agTRUE)
          {
            /* compare the sas address */
            if ( (AttachedExpander->upStreamSASAddressHi != DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo))
                 || (AttachedExpander->upStreamSASAddressLo != DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo)))
            {
              if (AttachedExpander->TTTSupported && oneExpander->TTTSupported)
              {
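                /* Both expanders advertise table-to-table attachment support,
                   which SAS-2 permits; the remaining concern is a mixed
                   SAS-2/SAS-1.1 table-to-table pairing, checked next. */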
                /* needs further error checking
                   UpstreamExpanderOfAttachedExpander = AttachedExpander->UpStreamExpander
                   for (i=0;i<DM_MAX_EXPANDER_PHYS;i++)
                   {
                     if (UpstreamExpanderOfAttachedExpander->downStreamPhys[i] != 0 &&
                   }
                */
                SAS2SAS11Check = dmSAS2SAS11ErrorCheck(dmRoot, onePortContext, AttachedExpander->dmUpStreamExpander, AttachedExpander, oneExpander);
                if (SAS2SAS11Check == agTRUE)
                {
                  DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** Topology Error SAS2 and SAS1.1!!!\n"));
                  onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo);
                  onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo);
                  onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId;
                  DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier));
                  /* discovery done */
                  dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE);
                }
                else
                {
                  DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: Allowed Table to Table (1)\n"));
                  /* move on to the next phys, but do not proceed past oneExpander */
                  oneExpander->UndoDueToTTTSupported = agTRUE;
                  onePortContext->discovery.DeferredError = agFALSE;
                }
              }
              else
              {
                /* TODO: discovery error, callback */
                DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** Topology Error two table routing phys connected (1)!!!\n"));
                onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo);
                onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo);
                onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId;
                DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier));
                /* discovery done */
                dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE);
              }
            }
            else
            {
              DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Add edge expander=%p\n", AttachedExpander));
              /* set up downstream information on configurable expander */
              dmExpanderDownStreamPhyAdd(dmRoot, oneExpander, (bit8) oneExpander->discoveringPhyId);
              /* haha */
              dmExpanderUpStreamPhyAdd(dmRoot, AttachedExpander, (bit8) oneExpander->discoveringPhyId);
              /* Add the pAttachedExpander to discovering list */
              dmDiscoveringExpanderAdd(dmRoot, onePortContext, AttachedExpander);
            }
          }
          /* If the attached expander doesn't have up stream device */
          else
          {
            if (AttachedExpander->TTTSupported && oneExpander->TTTSupported)
            {
              DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: Allowed Table to Table (2)\n"));
              /* move on to the next phys, but do not proceed past oneExpander */
              oneExpander->UndoDueToTTTSupported = agTRUE;
              onePortContext->discovery.DeferredError = agFALSE;
            }
            else
            {
              /* TODO: discovery error, callback */
              DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: **** Topology Error two table routing phys connected (2)!!!\n"));
              onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&oneDeviceData->agDeviceInfo);
              onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&oneDeviceData->agDeviceInfo);
              onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier = oneExpander->discoveringPhyId;
              DM_DBG1(("dmDownStreamDiscover2ExpanderPhy: sasAddressHi 0x%08x sasAddressLo 0x%08x phyid 0x%x\n", onePortContext->discovery.sasAddressIDDiscoverError.sasAddressHi, onePortContext->discovery.sasAddressIDDiscoverError.sasAddressLo, onePortContext->discovery.sasAddressIDDiscoverError.phyIdentifier));
              /* discovery done */
              dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE);
            }
          }
        }
      } /* for else if (SAS2_DISCRSP_GET_ROUTINGATTRIB(pDiscoverResp) == SAS_ROUTING_TABLE) */
      /* do this regardless of sub or table */
      /* If status is still DISCOVERY_DOWN_STREAM */
      if ( onePortContext->discovery.status == DISCOVERY_DOWN_STREAM &&
           onePortContext->discovery.ConfiguresOthers == agFALSE)
      {
        DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 2nd before\n"));
        dmDumpAllUpExp(dmRoot, onePortContext, oneExpander);
        UpStreamExpander = oneExpander->dmUpStreamExpander;
        ConfigurableExpander = dmFindConfigurableExp(dmRoot, onePortContext, oneExpander);
        configSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo);
        configSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo);
        if (ConfigurableExpander)
        {
          if ( (ConfigurableExpander->dmDevice->SASAddressID.sasAddressHi
== DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo)) && (ConfigurableExpander->dmDevice->SASAddressID.sasAddressLo == DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo)) ) { /* directly attached between oneExpander and ConfigurableExpander */ DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 2nd before loc 1\n")); configSASAddressHi = oneExpander->dmDevice->SASAddressID.sasAddressHi; configSASAddressLo = oneExpander->dmDevice->SASAddressID.sasAddressLo; } else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 2nd before loc 2\n")); configSASAddressHi = DEVINFO_GET_SAS_ADDRESSHI(&AttachedDevice->agDeviceInfo); configSASAddressLo = DEVINFO_GET_SAS_ADDRESSLO(&AttachedDevice->agDeviceInfo); } } /* if !ConfigurableExpander */ dupConfigSASAddr = dmDuplicateConfigSASAddr(dmRoot, ConfigurableExpander, configSASAddressHi, configSASAddressLo ); if ( ConfigurableExpander && dupConfigSASAddr == agFALSE) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 2nd q123 \n")); UpStreamExpander->dmCurrentDownStreamExpander = oneExpander; ConfigurableExpander->currentDownStreamPhyIndex = dmFindCurrentDownStreamPhyIndex(dmRoot, ConfigurableExpander); ConfigurableExpander->dmReturnginExpander = oneExpander; dmRoutingEntryAdd(dmRoot, ConfigurableExpander, ConfigurableExpander->downStreamPhys[ConfigurableExpander->currentDownStreamPhyIndex], configSASAddressHi, configSASAddressLo ); } } /* if (onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) */ /* incremental discovery */ if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_INCREMENTAL_START) { connectionRate = MIN(onePortContext->LinkRate, SAS2_DISCRSP_GET_LOGICAL_LINKRATE(pDiscoverResp)); if (SAS2_DISCRSP_IS_STP_TARGET(pDiscoverResp) || SAS2_DISCRSP_IS_SATA_DEVICE(pDiscoverResp)) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: incremental SATA_STP\n")); dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, STP_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: incremental SAS\n")); dmPortSASDeviceAdd( dmRoot, onePortContext, sasIdentify, agFALSE, connectionRate, dmAllShared->itNexusTimeout, 0, SAS_DEVICE_TYPE, oneDeviceData, oneExpander, pDiscoverResp->phyIdentifier ); } } }/* else; existing devce */ } /* not attached to myself */ /* If the attached device is myself */ else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Found Self\n")); DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 3rd before\n")); dmDumpAllUpExp(dmRoot, onePortContext, oneExpander); if (onePortContext->discovery.ConfiguresOthers == agFALSE) { UpStreamExpander = oneExpander->dmUpStreamExpander; ConfigurableExpander = dmFindConfigurableExp(dmRoot, onePortContext, oneExpander); dupConfigSASAddr = dmDuplicateConfigSASAddr(dmRoot, ConfigurableExpander, onePortContext->sasLocalAddressHi, onePortContext->sasLocalAddressLo ); if ( ConfigurableExpander && dupConfigSASAddr == agFALSE) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: 3rd q123 Setup routing table\n")); UpStreamExpander->dmCurrentDownStreamExpander = oneExpander; ConfigurableExpander->currentDownStreamPhyIndex = dmFindCurrentDownStreamPhyIndex(dmRoot, ConfigurableExpander); ConfigurableExpander->dmReturnginExpander = oneExpander; dmRoutingEntryAdd(dmRoot, ConfigurableExpander, ConfigurableExpander->downStreamPhys[ConfigurableExpander->currentDownStreamPhyIndex], onePortContext->sasLocalAddressHi, onePortContext->sasLocalAddressLo ); } } } } /* If no device is attached */ else { } /* Increment the 
discovering phy id */ oneExpander->discoveringPhyId ++; /* If the discovery status is DISCOVERY_DOWN_STREAM */ if ( onePortContext->discovery.status == DISCOVERY_DOWN_STREAM ) { /* If not the last phy */ if ( oneExpander->discoveringPhyId < oneDeviceData->numOfPhys ) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: More Phys to discover\n")); /* continue discovery for the next phy */ dmDiscoverSend(dmRoot, oneDeviceData); } /* If the last phy */ else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: No More Phys\n")); /* for MCN */ dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData); ConfigurableExpander = dmFindConfigurableExp(dmRoot, onePortContext, oneExpander); if (oneExpander->UndoDueToTTTSupported == agTRUE && ConfigurableExpander != agNULL) // if (oneExpander->UndoDueToTTTSupported == agTRUE) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: Not sure!!!\n")); dmDiscoveringUndoAdd(dmRoot, onePortContext, oneExpander); oneExpander->UndoDueToTTTSupported = agFALSE; } /* remove the expander from the discovering list */ dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander); /* continue downstream discovering */ dmDownStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } } else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: onePortContext->discovery.status not in DISCOVERY_DOWN_STREAM; status %d\n", onePortContext->discovery.status)); } DM_DBG2(("dmDownStreamDiscover2ExpanderPhy: end return phyID#%d\n", oneExpander->discoveringPhyId - 1)); return; } osGLOBAL void dmDiscoveringUndoAdd( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmList_t *ExpanderList; dmExpander_t *tempExpander; dmIntPortContext_t *tmpOnePortContext = onePortContext; DM_DBG2(("dmDiscoveringUndoAdd: start\n")); if (DMLIST_EMPTY(&(tmpOnePortContext->discovery.discoveringExpanderList))) { DM_DBG2(("dmDiscoveringUndoAdd: empty discoveringExpanderList\n")); return; } // DM_DBG2(("dmDiscoveringUndoAdd: before\n")); // dmDumpAllExp(dmRoot, onePortContext, oneExpander); ExpanderList = tmpOnePortContext->discovery.discoveringExpanderList.flink; while (ExpanderList != &(tmpOnePortContext->discovery.discoveringExpanderList)) { tempExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if ( tempExpander == agNULL) { DM_DBG1(("dmDiscoveringUndoAdd: tempExpander is NULL!!!\n")); return; } if (tempExpander->dmUpStreamExpander == oneExpander) { DM_DBG2(("dmDiscoveringUndoAdd: match!!! 
expander id %d\n", tempExpander->id)); DM_DBG2(("dmDiscoveringUndoAdd: exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG2(("dmDiscoveringUndoAdd: exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_DEQUEUE_THIS(&(tempExpander->linkNode)); // DMLIST_ENQUEUE_AT_TAIL(&(tempExpander->linkNode), &(dmAllShared->freeExpanderList)); DMLIST_ENQUEUE_AT_TAIL(&(tempExpander->linkNode), &(dmAllShared->mainExpanderList)); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); ExpanderList = tmpOnePortContext->discovery.discoveringExpanderList.flink; } if (DMLIST_EMPTY(&(tmpOnePortContext->discovery.discoveringExpanderList))) { DM_DBG2(("dmDiscoveringUndoAdd: hitting break\n")); break; } ExpanderList = ExpanderList->flink; } // DM_DBG2(("dmDiscoveringUndoAdd: after\n")); // dmDumpAllExp(dmRoot, onePortContext, oneExpander); return; } osGLOBAL void dmHandleZoneViolation( dmRoot_t *dmRoot, agsaRoot_t *agRoot, agsaIORequest_t *agIORequest, dmDeviceData_t *oneDeviceData, dmSMPFrameHeader_t *frameHeader, agsaFrameHandle_t frameHandle ) { dmIntPortContext_t *onePortContext = agNULL; dmExpander_t *oneExpander = agNULL; DM_DBG1(("dmHandleZoneViolation: start\n")); DM_DBG1(("dmHandleZoneViolation: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG1(("dmHandleZoneViolation: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); onePortContext = oneDeviceData->dmPortContext; oneExpander = oneDeviceData->dmExpander; if (dmDiscoverCheck(dmRoot, onePortContext) == agTRUE) { DM_DBG1(("dmHandleZoneViolation: invalid port or aborted discovery!!!\n")); return; } /* for MCN */ dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData); /* remove the expander from the discovering list */ dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander); if ( onePortContext->discovery.status == DISCOVERY_UP_STREAM) { /* continue upstream discovering */ dmUpStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } else /* DISCOVERY_DOWN_STREAM or DISCOVERY_CONFIG_ROUTING */ { /* continue downstream discovering */ dmDownStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } return; } osGLOBAL void dmUpStreamDiscoverExpanderPhySkip( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { dmDeviceData_t *oneDeviceData; DM_DBG3(("dmUpStreamDiscoverExpanderPhySkip: start\n")); oneDeviceData = oneExpander->dmDevice; DM_DBG3(("dmUpStreamDiscoverExpanderPhySkip: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmUpStreamDiscoverExpanderPhySkip: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); oneExpander->discoveringPhyId++; if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { if ( oneExpander->discoveringPhyId < oneDeviceData->numOfPhys ) { DM_DBG3(("dmUpStreamDiscoverExpanderPhySkip: More Phys to discover\n")); /* continue discovery for the next phy */ dmDiscoverSend(dmRoot, oneDeviceData); } else { DM_DBG3(("dmUpStreamDiscoverExpanderPhySkip: No More Phys\n")); /* for MCN */ dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData); /* remove the expander from the discovering list */ dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander); /* continue upstream discovering */ dmUpStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } } else { DM_DBG3(("dmUpStreamDiscoverExpanderPhySkip: onePortContext->discovery.status not in DISCOVERY_UP_STREAM; status %d\n", 
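/* reaching this branch means the discovery state moved away from DISCOVERY_UP_STREAM (for example, an aborted or failed discovery), so no further DISCOVER request is issued for this expander */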
onePortContext->discovery.status)); } DM_DBG3(("dmUpStreamDiscoverExpanderPhySkip: end return phyID#%d\n", oneExpander->discoveringPhyId - 1)); return; } osGLOBAL void dmUpStreamDiscover2ExpanderPhySkip( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { dmDeviceData_t *oneDeviceData; DM_DBG2(("dmUpStreamDiscover2ExpanderPhySkip: start\n")); oneDeviceData = oneExpander->dmDevice; oneExpander->discoveringPhyId++; if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { if ( oneExpander->discoveringPhyId < oneDeviceData->numOfPhys ) { DM_DBG2(("dmUpStreamDiscover2ExpanderPhySkip: DISCOVERY_UP_STREAM find more ...\n")); /* continue discovery for the next phy */ dmDiscoverSend(dmRoot, oneDeviceData); } else { DM_DBG2(("dmUpStreamDiscover2ExpanderPhySkip: DISCOVERY_UP_STREAM last phy continue upstream..\n")); /* for MCN */ dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData); /* remove the expander from the discovering list */ dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander); /* continue upstream discovering */ dmUpStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } } else { DM_DBG2(("dmUpStreamDiscover2ExpanderPhySkip: onePortContext->discovery.status not in DISCOVERY_UP_STREAM; status %d\n", onePortContext->discovery.status)); } DM_DBG2(("dmUpStreamDiscover2ExpanderPhySkip: end return phyID#%d\n", oneExpander->discoveringPhyId - 1)); return; } osGLOBAL void dmDownStreamDiscoverExpanderPhySkip( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { dmDeviceData_t *oneDeviceData; DM_DBG3(("dmDownStreamDiscoverExpanderPhySkip: start\n")); oneDeviceData = oneExpander->dmDevice; DM_DBG3(("dmDownStreamDiscoverExpanderPhySkip: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDownStreamDiscoverExpanderPhySkip: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); /* Increment the discovering phy id */ oneExpander->discoveringPhyId ++; /* If the discovery status is DISCOVERY_DOWN_STREAM */ if ( onePortContext->discovery.status == DISCOVERY_DOWN_STREAM ) { /* If not the last phy */ if ( oneExpander->discoveringPhyId < oneDeviceData->numOfPhys ) { DM_DBG3(("dmDownStreamDiscoverExpanderPhySkip: More Phys to discover\n")); /* continue discovery for the next phy */ dmDiscoverSend(dmRoot, oneDeviceData); } /* If the last phy */ else { DM_DBG3(("dmDownStreamDiscoverExpanderPhySkip: No More Phys\n")); /* for MCN */ dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData); /* remove the expander from the discovering list */ dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander); /* continue downstream discovering */ dmDownStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } } else { DM_DBG3(("dmDownStreamDiscoverExpanderPhySkip: onePortContext->discovery.status not in DISCOVERY_DOWN_STREAM; status %d\n", onePortContext->discovery.status)); } DM_DBG3(("dmDownStreamDiscoverExpanderPhySkip: end return phyID#%d\n", oneExpander->discoveringPhyId - 1)); return; } osGLOBAL void dmDownStreamDiscover2ExpanderPhySkip( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { dmDeviceData_t *oneDeviceData; DM_DBG2(("dmDownStreamDiscover2ExpanderPhySkip: start\n")); oneDeviceData = oneExpander->dmDevice; /* Increment the discovering phy id */ oneExpander->discoveringPhyId ++; /* If the discovery status is DISCOVERY_DOWN_STREAM */ if ( onePortContext->discovery.status == DISCOVERY_DOWN_STREAM ) { /* If not the last phy */ if ( 
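/* more phys left to walk on this expander? */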
oneExpander->discoveringPhyId < oneDeviceData->numOfPhys ) { DM_DBG2(("dmDownStreamDiscover2ExpanderPhySkip: More Phys to discover\n")); /* continue discovery for the next phy */ dmDiscoverSend(dmRoot, oneDeviceData); } /* If the last phy */ else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhySkip: No More Phys\n")); /* for MCN */ dmUpdateAllAdjacent(dmRoot, onePortContext, oneDeviceData); /* remove the expander from the discovering list */ dmDiscoveringExpanderRemove(dmRoot, onePortContext, oneExpander); /* continue downstream discovering */ dmDownStreamDiscovering(dmRoot, onePortContext, oneDeviceData); } } else { DM_DBG2(("dmDownStreamDiscover2ExpanderPhySkip: onePortContext->discovery.status not in DISCOVERY_DOWN_STREAM; status %d\n", onePortContext->discovery.status)); } DM_DBG2(("dmDownStreamDiscover2ExpanderPhySkip: end return phyID#%d\n", oneExpander->discoveringPhyId - 1)); return; } osGLOBAL void dmExpanderUpStreamPhyAdd( dmRoot_t *dmRoot, dmExpander_t *oneExpander, bit8 phyId ) { bit32 i; bit32 hasSet = agFALSE; DM_DBG3(("dmExpanderUpStreamPhyAdd: start, phyid %d\n", phyId)); DM_DBG3(("dmExpanderUpStreamPhyAdd: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmExpanderUpStreamPhyAdd: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); DM_DBG3(("dmExpanderUpStreamPhyAdd: phyid %d numOfUpStreamPhys %d\n", phyId, oneExpander->numOfUpStreamPhys)); for ( i = 0; i < oneExpander->numOfUpStreamPhys; i ++ ) { if ( oneExpander->upStreamPhys[i] == phyId ) { hasSet = agTRUE; break; } } if ( hasSet == agFALSE ) { oneExpander->upStreamPhys[oneExpander->numOfUpStreamPhys ++] = phyId; } DM_DBG3(("dmExpanderUpStreamPhyAdd: AFTER phyid %d numOfUpStreamPhys %d\n", phyId, oneExpander->numOfUpStreamPhys)); /* for debugging */ for ( i = 0; i < oneExpander->numOfUpStreamPhys; i ++ ) { DM_DBG3(("dmExpanderUpStreamPhyAdd: index %d upstream[index] %d\n", i, oneExpander->upStreamPhys[i])); } return; } osGLOBAL void dmExpanderDownStreamPhyAdd( dmRoot_t *dmRoot, dmExpander_t *oneExpander, bit8 phyId ) { bit32 i; bit32 hasSet = agFALSE; DM_DBG3(("dmExpanderDownStreamPhyAdd: start, phyid %d\n", phyId)); DM_DBG3(("dmExpanderDownStreamPhyAdd: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmExpanderDownStreamPhyAdd: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); DM_DBG3(("dmExpanderDownStreamPhyAdd: phyid %d numOfDownStreamPhys %d\n", phyId, oneExpander->numOfDownStreamPhys)); for ( i = 0; i < oneExpander->numOfDownStreamPhys; i ++ ) { if ( oneExpander->downStreamPhys[i] == phyId ) { hasSet = agTRUE; break; } } if ( hasSet == agFALSE ) { oneExpander->downStreamPhys[oneExpander->numOfDownStreamPhys ++] = phyId; } DM_DBG3(("dmExpanderDownStreamPhyAdd: AFTER phyid %d numOfDownStreamPhys %d\n", phyId, oneExpander->numOfDownStreamPhys)); /* for debugging */ for ( i = 0; i < oneExpander->numOfDownStreamPhys; i ++ ) { DM_DBG3(("dmExpanderDownStreamPhyAdd: index %d downstream[index] %d\n", i, oneExpander->downStreamPhys[i])); } return; } osGLOBAL void dmDiscoveryReportMCN( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; bit16 extension = 0; dmDeviceData_t *oneAttachedExpDeviceData = agNULL; DM_DBG2(("dmDiscoveryReportMCN: start\n")); /* if full discovery, report all devices using MCN; if incremental
discovery, 1. compare MCN and PrevMCN 2. report the changed ones; report MCN 3. set PrevMCN to MCN PrevMCN = MCN */ DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if ( oneDeviceData == agNULL) { DM_DBG1(("dmDiscoveryReportMCN: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmDiscoveryReportMCN: loop did %d\n", oneDeviceData->id)); if (oneDeviceData->dmPortContext == onePortContext) { DM_DBG2(("dmDiscoveryReportMCN: oneDeviceData sasAddressHi 0x%08x sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG2(("dmDiscoveryReportMCN: MCN 0x%08x PrevMCN 0x%08x\n", oneDeviceData->MCN, oneDeviceData->PrevMCN)); if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { DM_DBG2(("dmDiscoveryReportMCN: FULL_START\n")); } else { DM_DBG2(("dmDiscoveryReportMCN: INCREMENTAL_START\n")); } /* if MCN is 0, the device is removed */ if (oneDeviceData->MCN != oneDeviceData->PrevMCN && oneDeviceData->MCN != 0) { DM_DBG2(("dmDiscoveryReportMCN: reporting \n")); extension = oneDeviceData->dmDeviceInfo.ext; /* zero out MCN in extension */ extension = extension & 0x7FF; /* sets MCN in extension */ extension = extension | (oneDeviceData->MCN << 11); DEVINFO_PUT_EXT(&(oneDeviceData->dmDeviceInfo), extension); DM_DBG5(("dmDiscoveryReportMCN: MCN 0x%08x PrevMCN 0x%08x\n", DEVINFO_GET_EXT_MCN(&(oneDeviceData->dmDeviceInfo)), oneDeviceData->PrevMCN)); if (oneDeviceData->ExpDevice != agNULL) { DM_DBG2(("dmDiscoveryReportMCN: attached expander case\n")); oneAttachedExpDeviceData = oneDeviceData->ExpDevice; tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, &oneAttachedExpDeviceData->dmDeviceInfo, dmDeviceMCNChange); } else { DM_DBG2(("dmDiscoveryReportMCN: No attached expander case\n")); tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, agNULL, dmDeviceMCNChange); } oneDeviceData->PrevMCN = oneDeviceData->MCN; } else { DM_DBG2(("dmDiscoveryReportMCN: No change; no reporting \n")); if (oneDeviceData->MCN == 0) { oneDeviceData->PrevMCN = oneDeviceData->MCN; } } } DeviceListList = DeviceListList->flink; } return; } osGLOBAL void dmDiscoveryDumpMCN( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG3(("dmDiscoveryDumpMCN: start\n")); DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmDiscoveryDumpMCN: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmDiscoveryDumpMCN: loop did %d\n", oneDeviceData->id)); if (oneDeviceData->dmPortContext == onePortContext) { DM_DBG3(("dmDiscoveryDumpMCN: oneDeviceData sasAddressHi 0x%08x sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3(("dmDiscoveryDumpMCN: MCN 0x%08x PrevMCN 0x%08x\n", oneDeviceData->MCN, oneDeviceData->PrevMCN)); } DeviceListList = DeviceListList->flink; } return; } osGLOBAL void dmDiscoveryResetMCN( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t 
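/* MCN bookkeeping, as used in this file: MCN counts how often a device is reached during a discovery pass, is min-propagated to adjacent devices by dmUpdateAllAdjacent(), and is reported to TDM in the upper bits of the 'ext' field (see dmDiscoveryReportMCN above); this function clears that state before a new pass */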
*dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG2(("dmDiscoveryResetMCN: start\n")); /* reinitialize the device data belonging to this port context */ DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmDiscoveryResetMCN: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmDiscoveryResetMCN: loop did %d\n", oneDeviceData->id)); if (oneDeviceData->dmPortContext == onePortContext) { if (oneDeviceData->ExpDevice != agNULL) { DM_DBG2(("dmDiscoveryResetMCN: resetting oneDeviceData->ExpDevice\n")); oneDeviceData->ExpDevice = agNULL; } DM_DBG3(("dmDiscoveryResetMCN: resetting MCN and MCNdone\n")); oneDeviceData->MCN = 0; oneDeviceData->MCNDone = agFALSE; DM_DBG2(("dmDiscoveryResetMCN: oneDeviceData sasAddressHi 0x%08x sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); } DeviceListList = DeviceListList->flink; } return; } /* take min(oneDeviceData's MCN, the found one's MCN) across all upstream and downstream; find adjacent expanders and mark them done; sees only adjacent targets */ osGLOBAL void dmUpdateAllAdjacent( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData /* current one */ ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *tmponeDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG2(("dmUpdateAllAdjacent: start\n")); if (oneDeviceData == agNULL) { DM_DBG1(("dmUpdateAllAdjacent: oneDeviceData is NULL!!!\n")); return; } oneDeviceData->MCNDone = agTRUE; DM_DBG2(("dmUpdateAllAdjacent: oneDeviceData sasAddressHi 0x%08x sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { tmponeDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if ( tmponeDeviceData == agNULL) { DM_DBG1(("dmUpdateAllAdjacent: tmponeDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmUpdateAllAdjacent: loop did %d\n", tmponeDeviceData->id)); if (tmponeDeviceData->dmPortContext == onePortContext && tmponeDeviceData->ExpDevice == oneDeviceData) { DM_DBG2(("dmUpdateAllAdjacent: setting MCN DONE\n")); DM_DBG2(("dmUpdateAllAdjacent: tmponeDeviceData sasAddressHi 0x%08x sasAddressLo 0x%08x\n", tmponeDeviceData->SASAddressID.sasAddressHi, tmponeDeviceData->SASAddressID.sasAddressLo)); tmponeDeviceData->MCNDone = agTRUE; if (oneDeviceData->directlyAttached == agFALSE) { DM_DBG2(("dmUpdateAllAdjacent: tmponeDeviceData MCN 0x%x\n", tmponeDeviceData->MCN)); DM_DBG2(("dmUpdateAllAdjacent: oneDeviceData MCN 0x%x\n", oneDeviceData->MCN)); tmponeDeviceData->MCN = MIN(oneDeviceData->MCN, tmponeDeviceData->MCN); } } DeviceListList = DeviceListList->flink; } return; } osGLOBAL void dmUpdateMCN( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *AdjacentDeviceData, /* adjacent expander */ dmDeviceData_t *oneDeviceData /* current one */ ) { DM_DBG2(("dmUpdateMCN: start\n")); if (AdjacentDeviceData == agNULL) { DM_DBG1(("dmUpdateMCN: AdjacentDeviceData is NULL!!!\n")); return; } if (oneDeviceData == agNULL) { DM_DBG1(("dmUpdateMCN: oneDeviceData is NULL!!!\n")); return; } DM_DBG2(("dmUpdateMCN: Current sasAddressHi 0x%08x
sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG2(("dmUpdateMCN: AdjacentDeviceData one sasAddressHi 0x%08x sasAddressLo 0x%08x\n", AdjacentDeviceData->SASAddressID.sasAddressHi, AdjacentDeviceData->SASAddressID.sasAddressLo)); if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { DM_DBG2(("dmUpdateMCN: DISCOVERY_UP_STREAM\n")); } if (onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) { DM_DBG2(("dmUpdateMCN: DISCOVERY_DOWN_STREAM\n")); } /* MCN */ /* a directly attached device does not have its MCN updated; update only the adjacent device data */ if (oneDeviceData->directlyAttached == agTRUE && AdjacentDeviceData->MCNDone == agFALSE) { AdjacentDeviceData->MCN++; DM_DBG2(("dmUpdateMCN: case 1 oneDeviceData MCN 0x%x\n", oneDeviceData->MCN)); DM_DBG2(("dmUpdateMCN: case 1 AdjacentDeviceData MCN 0x%x\n", AdjacentDeviceData->MCN)); } else if (AdjacentDeviceData->MCNDone == agFALSE) { AdjacentDeviceData->MCN++; AdjacentDeviceData->MCN = MIN(oneDeviceData->MCN, AdjacentDeviceData->MCN); DM_DBG2(("dmUpdateMCN: case 2 oneDeviceData MCN 0x%x\n", oneDeviceData->MCN)); DM_DBG2(("dmUpdateMCN: case 2 AdjacentDeviceData MCN 0x%x\n", AdjacentDeviceData->MCN)); } return; } /* go through the expander list and the device list */ osGLOBAL dmDeviceData_t * dmPortSASDeviceFind( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, bit32 sasAddrLo, bit32 sasAddrHi, dmDeviceData_t *CurrentDeviceData /* current expander */ ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData, *RetDeviceData=agNULL; dmList_t *DeviceListList; DM_DBG3(("dmPortSASDeviceFind: start\n")); DM_DBG3(("dmPortSASDeviceFind: sasAddressHi 0x%08x sasAddressLo 0x%08x\n", sasAddrHi, sasAddrLo)); DM_ASSERT((agNULL != dmRoot), ""); DM_ASSERT((agNULL != onePortContext), ""); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); /* find a device's existence */ DeviceListList = dmAllShared->MainDeviceList.flink; if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { DM_DBG3(("dmPortSASDeviceFind: Full discovery\n")); while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmPortSASDeviceFind: oneDeviceData is NULL!!!\n")); return agNULL; } if ((oneDeviceData->SASAddressID.sasAddressHi == sasAddrHi) && (oneDeviceData->SASAddressID.sasAddressLo == sasAddrLo) && (oneDeviceData->valid == agTRUE) && (oneDeviceData->dmPortContext == onePortContext) ) { DM_DBG3(("dmPortSASDeviceFind: Found pid %d did %d\n", onePortContext->id, oneDeviceData->id)); DM_DBG3(("dmPortSASDeviceFind: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmPortSASDeviceFind: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); RetDeviceData = oneDeviceData; dmUpdateMCN(dmRoot, onePortContext, RetDeviceData, CurrentDeviceData); break; } DeviceListList = DeviceListList->flink; } } else { /* incremental discovery */ DM_DBG3(("dmPortSASDeviceFind: Incremental discovery\n")); while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmPortSASDeviceFind: oneDeviceData is NULL!!!\n")); return agNULL; } if ((oneDeviceData->SASAddressID.sasAddressHi == sasAddrHi) && (oneDeviceData->SASAddressID.sasAddressLo == sasAddrLo) &&
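/* note: the incremental pass below matches on valid2 (the incremental-discovery validity flag) where the full-discovery loop above matches on valid */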
(oneDeviceData->valid2 == agTRUE) && (oneDeviceData->dmPortContext == onePortContext) ) { DM_DBG3(("dmPortSASDeviceFind: Found pid %d did %d\n", onePortContext->id, oneDeviceData->id)); DM_DBG3(("dmPortSASDeviceFind: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmPortSASDeviceFind: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); RetDeviceData = oneDeviceData; dmUpdateMCN(dmRoot, onePortContext, RetDeviceData, CurrentDeviceData); break; } DeviceListList = DeviceListList->flink; } } tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); return RetDeviceData; } bit32 dmNewEXPorNot( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmSASSubID_t *dmSASSubID ) { // dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; // dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmExpander_t *oneExpander = agNULL; dmList_t *ExpanderList; bit32 ret = agTRUE; dmDeviceData_t *oneDeviceData = agNULL; DM_DBG3(("dmNewEXPorNot: start\n")); /* find a device's existence */ ExpanderList = onePortContext->discovery.discoveringExpanderList.flink; while (ExpanderList != &(onePortContext->discovery.discoveringExpanderList)) { oneExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if ( oneExpander == agNULL) { DM_DBG1(("dmNewEXPorNot: oneExpander is NULL!!!\n")); return agFALSE; } oneDeviceData = oneExpander->dmDevice; if ((oneDeviceData->SASAddressID.sasAddressHi == dmSASSubID->sasAddressHi) && (oneDeviceData->SASAddressID.sasAddressLo == dmSASSubID->sasAddressLo) && (oneDeviceData->dmPortContext == onePortContext) ) { DM_DBG3(("dmNewEXPorNot: Found pid %d did %d\n", onePortContext->id, oneDeviceData->id)); ret = agFALSE; break; } ExpanderList = ExpanderList->flink; } return ret; } bit32 dmNewSASorNot( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmSASSubID_t *dmSASSubID ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; bit32 ret = agTRUE; DM_DBG3(("dmNewSASorNot: start\n")); /* find a device's existence */ DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmNewSASorNot: oneDeviceData is NULL!!!\n")); return agFALSE; } if ((oneDeviceData->SASAddressID.sasAddressHi == dmSASSubID->sasAddressHi) && (oneDeviceData->SASAddressID.sasAddressLo == dmSASSubID->sasAddressLo) && (oneDeviceData->dmPortContext == onePortContext) && (oneDeviceData->registered == agTRUE) ) { DM_DBG3(("dmNewSASorNot: Found pid %d did %d\n", onePortContext->id, oneDeviceData->id)); ret = agFALSE; break; } DeviceListList = DeviceListList->flink; } return ret; } /* call osGLOBAL bit32 tddmReportDevice( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext, dmDeviceInfo_t *dmDeviceInfo ) if not reported, report Device to TDM */ osGLOBAL dmDeviceData_t * dmPortSASDeviceAdd( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, agsaSASIdentify_t sasIdentify, bit32 sasInitiator, bit8 connectionRate, bit32 itNexusTimeout, bit32 firstBurstSize, bit32 deviceType, dmDeviceData_t *oneExpDeviceData, dmExpander_t *dmExpander, bit8 phyID ) { dmDeviceData_t *oneDeviceData = agNULL; bit8 dev_s_rate = 0; bit8 sasorsata = 1; dmSASSubID_t dmSASSubID; bit8 ExpanderConnectionRate = connectionRate; dmDeviceData_t *oneAttachedExpDeviceData = agNULL; bit16 
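/* layout of 'extension' as inferred from the code below (a sketch, not a spec): bits 0..7 carry the attach phy id, bit 8 flags an expander device, and dmDiscoveryReportMCN later stores the MCN at bits 11 and up (it masks with 0x7FF first); roughly: ext = phyID | (isExpander << 8) | (MCN << 11) */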
extension = 0; bit32 current_link_rate = 0; DM_DBG3(("dmPortSASDeviceAdd: start\n")); DM_DBG3(("dmPortSASDeviceAdd: connectionRate %d\n", connectionRate)); dmSASSubID.sasAddressHi = SA_IDFRM_GET_SAS_ADDRESSHI(&sasIdentify); dmSASSubID.sasAddressLo = SA_IDFRM_GET_SAS_ADDRESSLO(&sasIdentify); dmSASSubID.initiator_ssp_stp_smp = sasIdentify.initiator_ssp_stp_smp; dmSASSubID.target_ssp_stp_smp = sasIdentify.target_ssp_stp_smp; if (oneExpDeviceData != agNULL) { ExpanderConnectionRate = DEVINFO_GET_LINKRATE(&oneExpDeviceData->agDeviceInfo); DM_DBG3(("dmPortSASDeviceAdd: ExpanderConnectionRate 0x%x\n", ExpanderConnectionRate)); } if (oneExpDeviceData != agNULL) { if (oneExpDeviceData->SASAddressID.sasAddressHi == 0x0 && oneExpDeviceData->SASAddressID.sasAddressLo == 0x0) { DM_DBG1(("dmPortSASDeviceAdd: 1st Wrong expander!!!\n")); } } /* old device and already reported to TDM */ if ( agFALSE == dmNewSASorNot( dmRoot, onePortContext, &dmSASSubID ) ) /* old device */ { DM_DBG3(("dmPortSASDeviceAdd: OLD qqqq initiator_ssp_stp_smp %d target_ssp_stp_smp %d\n", dmSASSubID.initiator_ssp_stp_smp, dmSASSubID.target_ssp_stp_smp)); /* allocate a new device and set the valid bit */ oneDeviceData = dmAddSASToSharedcontext( dmRoot, onePortContext, &dmSASSubID, oneExpDeviceData, phyID ); if (oneDeviceData == agNULL) { DM_DBG1(("dmPortSASDeviceAdd: no more device, oneDeviceData is null!!!\n")); } /* If a device is allocated */ if ( oneDeviceData != agNULL ) { if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { DM_DBG3(("dmPortSASDeviceAdd: OLD, UP_STREAM\n")); } if (onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) { DM_DBG3(("dmPortSASDeviceAdd: OLD, DOWN_STREAM\n")); } if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { DM_DBG3(("dmPortSASDeviceAdd: FULL_START\n")); oneDeviceData->MCN++; } else { /* incremental */ DM_DBG3(("dmPortSASDeviceAdd: INCREMENTAL_START\n")); if (oneDeviceData->MCN == 0 && oneDeviceData->directlyAttached == agFALSE) { oneDeviceData->MCN++; } } DM_DBG3(("dmPortSASDeviceAdd: oneDeviceData MCN 0x%08x\n", oneDeviceData->MCN)); DM_DBG3(("dmPortSASDeviceAdd: oneDeviceData sasAddressHi 0x%08x sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3(("dmPortSASDeviceAdd: sasAddressHi 0x%08x\n", SA_IDFRM_GET_SAS_ADDRESSHI(&sasIdentify))); DM_DBG3(("dmPortSASDeviceAdd: sasAddressLo 0x%08x\n", SA_IDFRM_GET_SAS_ADDRESSLO(&sasIdentify))); // oneDeviceData->sasIdentify = sasIdentify; dm_memcpy(&(oneDeviceData->sasIdentify), &sasIdentify, sizeof(agsaSASIdentify_t)); DM_DBG3(("dmPortSASDeviceAdd: sasAddressHi 0x%08x\n", SA_IDFRM_GET_SAS_ADDRESSHI(&oneDeviceData->sasIdentify))); DM_DBG3(("dmPortSASDeviceAdd: sasAddressLo 0x%08x\n", SA_IDFRM_GET_SAS_ADDRESSLO(&oneDeviceData->sasIdentify))); /* parse sasIDframe to fill in agDeviceInfo */ DEVINFO_PUT_SMPTO(&oneDeviceData->agDeviceInfo, DEFAULT_SMP_TIMEOUT); DEVINFO_PUT_ITNEXUSTO(&oneDeviceData->agDeviceInfo, (bit16)itNexusTimeout); DEVINFO_PUT_FBS(&oneDeviceData->agDeviceInfo, (bit16)firstBurstSize); DEVINFO_PUT_FLAG(&oneDeviceData->agDeviceInfo, 1); oneDeviceData->SASSpecDeviceType = SA_IDFRM_GET_DEVICETTYPE(&sasIdentify); /* adjusting connectionRate */ oneAttachedExpDeviceData = oneDeviceData->ExpDevice; if (oneAttachedExpDeviceData != agNULL) { connectionRate = MIN(connectionRate, DEVINFO_GET_LINKRATE(&oneAttachedExpDeviceData->agDeviceInfo)); DM_DBG3(("dmPortSASDeviceAdd: 1st connectionRate 0x%x 
DEVINFO_GET_LINKRATE(&oneAttachedExpDeviceData->agDeviceInfo) 0x%x\n", connectionRate, DEVINFO_GET_LINKRATE(&oneAttachedExpDeviceData->agDeviceInfo))); } else { DM_DBG3(("dmPortSASDeviceAdd: 1st oneAttachedExpDeviceData is NULL\n")); } /* Device Type, SAS or SATA, connection rate; bit7 --- bit0 */ sasorsata = (bit8)deviceType; /* sTSDK spec device type */ dev_s_rate = dev_s_rate | (sasorsata << 4); dev_s_rate = dev_s_rate | MIN(connectionRate, ExpanderConnectionRate); /* detect link rate change */ current_link_rate = DEVINFO_GET_LINKRATE(&oneDeviceData->agDeviceInfo); if (current_link_rate != (bit32)MIN(connectionRate, ExpanderConnectionRate)) { DM_DBG1(("dmPortSASDeviceAdd: link rate changed current 0x%x new 0x%x\n", current_link_rate, MIN(connectionRate, ExpanderConnectionRate))); DEVINFO_PUT_DEV_S_RATE(&oneDeviceData->dmDeviceInfo, dev_s_rate); if (oneDeviceData->ExpDevice != agNULL) { oneAttachedExpDeviceData = oneDeviceData->ExpDevice; tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, &oneAttachedExpDeviceData->dmDeviceInfo, dmDeviceRateChange); } else { tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, agNULL, dmDeviceArrival); } } DEVINFO_PUT_DEV_S_RATE(&oneDeviceData->agDeviceInfo, dev_s_rate); DEVINFO_PUT_SAS_ADDRESSLO( &oneDeviceData->agDeviceInfo, SA_IDFRM_GET_SAS_ADDRESSLO(&oneDeviceData->sasIdentify) ); DEVINFO_PUT_SAS_ADDRESSHI( &oneDeviceData->agDeviceInfo, SA_IDFRM_GET_SAS_ADDRESSHI(&oneDeviceData->sasIdentify) ); oneDeviceData->agContext.osData = oneDeviceData; oneDeviceData->agContext.sdkData = agNULL; } return oneDeviceData; } /* old device */ /* new device */ DM_DBG3(("dmPortSASDeviceAdd: NEW qqqq initiator_ssp_stp_smp %d target_ssp_stp_smp %d\n", dmSASSubID.initiator_ssp_stp_smp, dmSASSubID.target_ssp_stp_smp)); /* allocate a new device and set the valid bit */ oneDeviceData = dmAddSASToSharedcontext( dmRoot, onePortContext, &dmSASSubID, oneExpDeviceData, phyID ); if (oneDeviceData == agNULL) { DM_DBG1(("dmPortSASDeviceAdd: no more device, oneDeviceData is null !!!\n")); } /* If a device is allocated */ if ( oneDeviceData != agNULL ) { // DM_DBG3(("dmPortSASDeviceAdd: sasAddressHi 0x%08x\n", SA_IDFRM_GET_SAS_ADDRESSHI(&sasIdentify))); // DM_DBG3(("dmPortSASDeviceAdd: sasAddressLo 0x%08x\n", SA_IDFRM_GET_SAS_ADDRESSLO(&sasIdentify))); // oneDeviceData->sasIdentify = sasIdentify; dm_memcpy(&(oneDeviceData->sasIdentify), &sasIdentify, sizeof(agsaSASIdentify_t)); if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { DM_DBG3(("dmPortSASDeviceAdd: NEW, UP_STREAM\n")); } if (onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) { DM_DBG3(("dmPortSASDeviceAdd: NEW, DOWN_STREAM\n")); } if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { DM_DBG3(("dmPortSASDeviceAdd: FULL_START\n")); oneDeviceData->MCN++; } else { /* incremental */ DM_DBG3(("dmPortSASDeviceAdd: INCREMENTAL_START\n")); if (oneDeviceData->MCN == 0 && oneDeviceData->directlyAttached == agFALSE) { oneDeviceData->MCN++; } } DM_DBG3(("dmPortSASDeviceAdd: oneDeviceData MCN 0x%08x\n", oneDeviceData->MCN)); DM_DBG3(("dmPortSASDeviceAdd: oneDeviceData sasAddressHi 0x%08x sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3(("dmPortSASDeviceAdd: sasAddressHi 0x%08x\n", SA_IDFRM_GET_SAS_ADDRESSHI(&oneDeviceData->sasIdentify))); DM_DBG3(("dmPortSASDeviceAdd: sasAddressLo 0x%08x\n", SA_IDFRM_GET_SAS_ADDRESSLO(&oneDeviceData->sasIdentify))); /* parse
sasIDframe to fill in agDeviceInfo */ DEVINFO_PUT_SMPTO(&oneDeviceData->agDeviceInfo, DEFAULT_SMP_TIMEOUT); DEVINFO_PUT_ITNEXUSTO(&oneDeviceData->agDeviceInfo, (bit16)itNexusTimeout); DEVINFO_PUT_FBS(&oneDeviceData->agDeviceInfo, (bit16)firstBurstSize); DEVINFO_PUT_FLAG(&oneDeviceData->agDeviceInfo, 1); oneDeviceData->SASSpecDeviceType = SA_IDFRM_GET_DEVICETTYPE(&sasIdentify); /* adjusting connectionRate */ oneAttachedExpDeviceData = oneDeviceData->ExpDevice; if (oneAttachedExpDeviceData != agNULL) { connectionRate = MIN(connectionRate, DEVINFO_GET_LINKRATE(&oneAttachedExpDeviceData->agDeviceInfo)); DM_DBG3(("dmPortSASDeviceAdd: 2nd connectionRate 0x%x DEVINFO_GET_LINKRATE(&oneAttachedExpDeviceData->agDeviceInfo) 0x%x\n", connectionRate, DEVINFO_GET_LINKRATE(&oneAttachedExpDeviceData->agDeviceInfo))); } else { DM_DBG3(("dmPortSASDeviceAdd: 2nd oneAttachedExpDeviceData is NULL\n")); } /* Device Type, SAS or SATA, connection rate; bit7 --- bit0 */ sasorsata = (bit8)deviceType; dev_s_rate = dev_s_rate | (sasorsata << 4); dev_s_rate = dev_s_rate | MIN(connectionRate, ExpanderConnectionRate); DEVINFO_PUT_DEV_S_RATE(&oneDeviceData->agDeviceInfo, dev_s_rate); DEVINFO_PUT_SAS_ADDRESSLO( &oneDeviceData->agDeviceInfo, SA_IDFRM_GET_SAS_ADDRESSLO(&oneDeviceData->sasIdentify) ); DEVINFO_PUT_SAS_ADDRESSHI( &oneDeviceData->agDeviceInfo, SA_IDFRM_GET_SAS_ADDRESSHI(&oneDeviceData->sasIdentify) ); oneDeviceData->agContext.osData = oneDeviceData; oneDeviceData->agContext.sdkData = agNULL; DM_DBG3(("dmPortSASDeviceAdd: did %d\n", oneDeviceData->id)); /* reporting to TDM; setting dmDeviceInfo */ DEVINFO_PUT_SMPTO(&oneDeviceData->dmDeviceInfo, DEFAULT_SMP_TIMEOUT); DEVINFO_PUT_ITNEXUSTO(&oneDeviceData->dmDeviceInfo, (bit16)itNexusTimeout); DEVINFO_PUT_FBS(&oneDeviceData->dmDeviceInfo, (bit16)firstBurstSize); DEVINFO_PUT_FLAG(&oneDeviceData->dmDeviceInfo, 1); DEVINFO_PUT_INITIATOR_SSP_STP_SMP(&oneDeviceData->dmDeviceInfo, dmSASSubID.initiator_ssp_stp_smp); DEVINFO_PUT_TARGET_SSP_STP_SMP(&oneDeviceData->dmDeviceInfo, dmSASSubID.target_ssp_stp_smp); extension = phyID; /* expander device: set bit 8 of extension */ if (oneDeviceData->SASSpecDeviceType == SAS_EDGE_EXPANDER_DEVICE || oneDeviceData->SASSpecDeviceType == SAS_FANOUT_EXPANDER_DEVICE ) { extension = (bit16)(extension | (1 << 8)); } DEVINFO_PUT_EXT(&oneDeviceData->dmDeviceInfo, extension); DEVINFO_PUT_DEV_S_RATE(&oneDeviceData->dmDeviceInfo, dev_s_rate); DEVINFO_PUT_SAS_ADDRESSLO( &oneDeviceData->dmDeviceInfo, SA_IDFRM_GET_SAS_ADDRESSLO(&oneDeviceData->sasIdentify) ); DEVINFO_PUT_SAS_ADDRESSHI( &oneDeviceData->dmDeviceInfo, SA_IDFRM_GET_SAS_ADDRESSHI(&oneDeviceData->sasIdentify) ); if (oneDeviceData->ExpDevice != agNULL) { DM_DBG3(("dmPortSASDeviceAdd: attached expander case\n")); oneAttachedExpDeviceData = oneDeviceData->ExpDevice; /* Puts attached expander's SAS address into dmDeviceInfo */ DEVINFO_PUT_SAS_ADDRESSLO( &oneAttachedExpDeviceData->dmDeviceInfo, oneAttachedExpDeviceData->SASAddressID.sasAddressLo ); DEVINFO_PUT_SAS_ADDRESSHI( &oneAttachedExpDeviceData->dmDeviceInfo, oneAttachedExpDeviceData->SASAddressID.sasAddressHi ); DM_DBG3(("dmPortSASDeviceAdd: oneAttachedExpDeviceData addrHi 0x%08x addrLo 0x%08x PhyID 0x%x ext 0x%x\n", DM_GET_SAS_ADDRESSHI(oneAttachedExpDeviceData->dmDeviceInfo.sasAddressHi), DM_GET_SAS_ADDRESSLO(oneAttachedExpDeviceData->dmDeviceInfo.sasAddressLo), phyID, extension)); if (oneAttachedExpDeviceData->SASAddressID.sasAddressHi == 0x0 && oneAttachedExpDeviceData->SASAddressID.sasAddressLo == 0x0) { DM_DBG1(("dmPortSASDeviceAdd: 2nd
Wrong expander!!!\n")); } if (oneDeviceData->reported == agFALSE) { oneDeviceData->registered = agTRUE; oneDeviceData->reported = agTRUE; if (deviceType == STP_DEVICE_TYPE) { /* STP device; DM needs to send SMP Report Phy SATA to get the SATA device type */ oneAttachedExpDeviceData->dmExpander->dmDeviceToProcess = oneDeviceData; dmReportPhySataSend(dmRoot, oneAttachedExpDeviceData, phyID); } else { /* SAS or SMP device */ tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, &oneAttachedExpDeviceData->dmDeviceInfo, dmDeviceArrival); } } } else { DM_DBG3(("dmPortSASDeviceAdd: NO attached expander case\n")); if (oneDeviceData->reported == agFALSE) { oneDeviceData->registered = agTRUE; oneDeviceData->reported = agTRUE; tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, agNULL, dmDeviceArrival); } } } return oneDeviceData; } osGLOBAL dmDeviceData_t * dmFindRegNValid( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmSASSubID_t *dmSASSubID ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; bit32 found = agFALSE; DM_DBG3(("dmFindRegNValid: start\n")); /* find a device's existence */ DeviceListList = dmAllShared->MainDeviceList.flink; if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { DM_DBG3(("dmFindRegNValid: Full discovery\n")); while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmFindRegNValid: oneDeviceData is NULL!!!\n")); return agNULL; } if ((oneDeviceData->SASAddressID.sasAddressHi == dmSASSubID->sasAddressHi) && (oneDeviceData->SASAddressID.sasAddressLo == dmSASSubID->sasAddressLo) && (oneDeviceData->valid == agTRUE) && (oneDeviceData->dmPortContext == onePortContext) ) { DM_DBG3(("dmFindRegNValid: Found pid %d did %d\n", onePortContext->id, oneDeviceData->id)); DM_DBG3(("dmFindRegNValid: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmFindRegNValid: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); found = agTRUE; break; } DeviceListList = DeviceListList->flink; } } else { /* incremental discovery */ DM_DBG3(("dmFindRegNValid: Incremental discovery\n")); while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmFindRegNValid: oneDeviceData is NULL!!!\n")); return agNULL; } if ((oneDeviceData->SASAddressID.sasAddressHi == dmSASSubID->sasAddressHi) && (oneDeviceData->SASAddressID.sasAddressLo == dmSASSubID->sasAddressLo) && (oneDeviceData->valid2 == agTRUE) && (oneDeviceData->dmPortContext == onePortContext) ) { DM_DBG3(("dmFindRegNValid: Found pid %d did %d\n", onePortContext->id, oneDeviceData->id)); DM_DBG3(("dmFindRegNValid: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmFindRegNValid: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); found = agTRUE; break; } DeviceListList = DeviceListList->flink; } } if (found == agFALSE) { DM_DBG3(("dmFindRegNValid: end returning NULL\n")); return agNULL; } else { DM_DBG3(("dmFindRegNValid: end returning NOT NULL\n")); return oneDeviceData; } } osGLOBAL void dmNotifyBC( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext, bit32 type) { dmIntPortContext_t *onePortContext =
agNULL; onePortContext = (dmIntPortContext_t *)dmPortContext->dmData; DM_DBG3(("dmNotifyBC: start\n")); if (onePortContext == agNULL) { DM_DBG1(("dmNotifyBC: onePortContext is NULL, wrong!!!\n")); return; } if (type == OSSA_HW_EVENT_BROADCAST_CHANGE) { if (onePortContext->DiscoveryAbortInProgress == agFALSE) { if (onePortContext->DiscoveryState == DM_DSTATE_COMPLETED) { DM_DBG3(("dmNotifyBC: BROADCAST_CHANGE\n")); onePortContext->DiscoveryState = DM_DSTATE_NOT_STARTED; onePortContext->discoveryOptions = DM_DISCOVERY_OPTION_INCREMENTAL_START; /* processed broadcast change */ onePortContext->discovery.SeenBC = agFALSE; } else { DM_DBG3(("dmNotifyBC: pid %d BROADCAST_CHANGE; updating SeenBC. Do nothing.\n", onePortContext->id)); onePortContext->discovery.SeenBC = agTRUE; } } } else if (type == OSSA_HW_EVENT_BROADCAST_SES) { DM_DBG3(("dmNotifyBC: OSSA_HW_EVENT_BROADCAST_SES\n")); } else if (type == OSSA_HW_EVENT_BROADCAST_EXP) { DM_DBG3(("dmNotifyBC: OSSA_HW_EVENT_BROADCAST_EXP\n")); } else { DM_DBG3(("dmNotifyBC: unspecified broadcast type 0x%x\n", type)); } return; } #ifdef WORKED /* triggers incremental discovery */ osGLOBAL void dmNotifyBC( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext, bit32 type) { dmIntPortContext_t *onePortContext = agNULL; onePortContext = (dmIntPortContext_t *)dmPortContext->dmData; DM_DBG3(("dmNotifyBC: start\n")); if (type == OSSA_HW_EVENT_BROADCAST_CHANGE) { if (onePortContext->DiscoveryState == DM_DSTATE_COMPLETED) { DM_DBG3(("dmNotifyBC: BROADCAST_CHANGE; does incremental discovery\n")); onePortContext->DiscoveryState = DM_DSTATE_NOT_STARTED; onePortContext->discoveryOptions = DM_DISCOVERY_OPTION_INCREMENTAL_START; /* processed broadcast change */ onePortContext->discovery.SeenBC = agFALSE; if (onePortContext->discovery.ResetTriggerred == agTRUE) { DM_DBG3(("dmNotifyBC: tdsaBCTimer\n")); dmBCTimer(dmRoot, onePortContext); } else { dmDiscover( dmRoot, dmPortContext, DM_DISCOVERY_OPTION_INCREMENTAL_START ); } } else { DM_DBG3(("dmNotifyBC: pid %d BROADCAST_CHANGE; updating SeenBC. Do nothing.\n", onePortContext->id)); onePortContext->discovery.SeenBC = agTRUE; } } else if (type == OSSA_HW_EVENT_BROADCAST_SES) { DM_DBG3(("dmNotifyBC: OSSA_HW_EVENT_BROADCAST_SES\n")); } else if (type == OSSA_HW_EVENT_BROADCAST_EXP) { DM_DBG3(("dmNotifyBC: OSSA_HW_EVENT_BROADCAST_EXP\n")); } else { DM_DBG3(("dmNotifyBC: unspecified broadcast type 0x%x\n", type)); } return; } #endif osGLOBAL bit32 dmResetFailedDiscovery( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext) { dmIntPortContext_t *onePortContext = agNULL; DM_DBG1(("dmResetFailedDiscovery: start\n")); onePortContext = (dmIntPortContext_t *)dmPortContext->dmData; if (onePortContext == agNULL) { DM_DBG1(("dmResetFailedDiscovery: onePortContext is NULL, wrong!!!\n")); return DM_RC_FAILURE; } if (onePortContext->DiscoveryState == DM_DSTATE_COMPLETED_WITH_FAILURE) { onePortContext->DiscoveryState = DM_DSTATE_COMPLETED; } else { DM_DBG1(("dmResetFailedDiscovery: discovery is NOT DM_DSTATE_COMPLETED_WITH_FAILURE. 
It is 0x%x\n", onePortContext->DiscoveryState)); return DM_RC_FAILURE; } return DM_RC_SUCCESS; } osGLOBAL bit32 dmQueryDiscovery( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext) { dmIntPortContext_t *onePortContext = agNULL; DM_DBG3(("dmQueryDiscovery: start\n")); onePortContext = (dmIntPortContext_t *)dmPortContext->dmData; if (onePortContext == agNULL) { DM_DBG1(("dmQueryDiscovery: onePortContext is NULL, wrong!!!\n")); return DM_RC_FAILURE; } /* call tddmQueryDiscoveryCB() */ if (onePortContext->DiscoveryState == DM_DSTATE_COMPLETED) { tddmQueryDiscoveryCB(dmRoot, dmPortContext, onePortContext->discoveryOptions, dmDiscCompleted); } else if (onePortContext->DiscoveryState == DM_DSTATE_COMPLETED_WITH_FAILURE) { tddmQueryDiscoveryCB(dmRoot, dmPortContext, onePortContext->discoveryOptions, dmDiscFailed); } else { tddmQueryDiscoveryCB(dmRoot, dmPortContext, onePortContext->discoveryOptions, dmDiscInProgress); } return DM_RC_SUCCESS; } /* should be called only for an expander */ osGLOBAL bit32 dmRegisterDevice( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext, dmDeviceInfo_t *dmDeviceInfo, agsaDevHandle_t *agDevHandle ) { dmIntPortContext_t *onePortContext = agNULL; dmExpander_t *oneExpander = agNULL; bit32 sasAddressHi, sasAddressLo; dmDeviceData_t *oneDeviceData = agNULL; dmSASSubID_t dmSASSubID; DM_DBG3(("dmRegisterDevice: start\n")); onePortContext = (dmIntPortContext_t *)dmPortContext->dmData; if (onePortContext == agNULL) { DM_DBG1(("dmRegisterDevice: onePortContext is NULL!!!\n")); return DM_RC_FAILURE; } if (onePortContext->valid == agFALSE) { DM_DBG1(("dmRegisterDevice: invalid port!!!\n")); return DM_RC_FAILURE; } onePortContext->RegFailed = agFALSE; /* tdssAddSASToSharedcontext() from ossaHwCB() osGLOBAL void tdssAddSASToSharedcontext( tdsaPortContext_t *tdsaPortContext_Instance, agsaRoot_t *agRoot, agsaDevHandle_t *agDevHandle, tdsaSASSubID_t *agSASSubID, bit32 registered, bit8 phyID, bit32 flag ); from discovery osGLOBAL tdsaDeviceData_t * tdssNewAddSASToSharedcontext( agsaRoot_t *agRoot, tdsaPortContext_t *onePortContext, tdsaSASSubID_t *agSASSubID, tdsaDeviceData_t *oneExpDeviceData, bit8 phyID ); */ /* start here */ dmSASSubID.sasAddressHi = DM_GET_SAS_ADDRESSHI(dmDeviceInfo->sasAddressHi); dmSASSubID.sasAddressLo = DM_GET_SAS_ADDRESSLO(dmDeviceInfo->sasAddressLo); dmSASSubID.initiator_ssp_stp_smp = dmDeviceInfo->initiator_ssp_stp_smp; dmSASSubID.target_ssp_stp_smp = dmDeviceInfo->target_ssp_stp_smp; oneDeviceData = dmAddSASToSharedcontext(dmRoot, onePortContext, &dmSASSubID, agNULL, 0xFF); if (oneDeviceData == agNULL) { DM_DBG1(("dmRegisterDevice: oneDeviceData is NULL!!!\n")); return DM_RC_FAILURE; } oneDeviceData->agDeviceInfo.devType_S_Rate = dmDeviceInfo->devType_S_Rate; dm_memcpy(oneDeviceData->agDeviceInfo.sasAddressHi, dmDeviceInfo->sasAddressHi, 4); dm_memcpy(oneDeviceData->agDeviceInfo.sasAddressLo, dmDeviceInfo->sasAddressLo, 4); /* finds the type of expanders */ if (DEVINFO_GET_EXT_SMP(dmDeviceInfo)) { if (DEVINFO_GET_EXT_EXPANDER_TYPE(dmDeviceInfo) == SAS_EDGE_EXPANDER_DEVICE) { oneDeviceData->SASSpecDeviceType = SAS_EDGE_EXPANDER_DEVICE; } else if (DEVINFO_GET_EXT_EXPANDER_TYPE(dmDeviceInfo) == SAS_FANOUT_EXPANDER_DEVICE) { oneDeviceData->SASSpecDeviceType = SAS_FANOUT_EXPANDER_DEVICE; } else { /* default */ DM_DBG4(("dmRegisterDevice: no expander type.
default to edge expander\n")); oneDeviceData->SASSpecDeviceType = SAS_EDGE_EXPANDER_DEVICE; } } if (DEVINFO_GET_EXT_MCN(dmDeviceInfo) == 0xF) { DM_DBG1(("dmRegisterDevice: directly attached expander\n")); oneDeviceData->directlyAttached = agTRUE; oneDeviceData->dmDeviceInfo.ext = (bit16)(oneDeviceData->dmDeviceInfo.ext | (0xF << 11)); } else { DM_DBG1(("dmRegisterDevice: NOT directly attached expander\n")); oneDeviceData->directlyAttached = agFALSE; } if (onePortContext->DiscoveryState == DM_DSTATE_NOT_STARTED) { DM_DBG3(("dmRegisterDevice: DM_DSTATE_NOT_STARTED\n")); /* before the discovery is started */ oneExpander = dmDiscoveringExpanderAlloc(dmRoot, onePortContext, oneDeviceData); if ( oneExpander != agNULL) { oneExpander->agDevHandle = agDevHandle; /* update SAS address field */ oneExpander->dmDevice->SASAddressID.sasAddressHi = DM_GET_SAS_ADDRESSHI(dmDeviceInfo->sasAddressHi); oneExpander->dmDevice->SASAddressID.sasAddressLo = DM_GET_SAS_ADDRESSLO(dmDeviceInfo->sasAddressLo); DM_DBG3(("dmRegisterDevice: AddrHi 0x%08x AddrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi, oneExpander->dmDevice->SASAddressID.sasAddressLo)); dmDiscoveringExpanderAdd(dmRoot, onePortContext, oneExpander); } else { DM_DBG1(("dmRegisterDevice: failed to allocate expander !!!\n")); /* remember that the registration failed so that a discovery can't be started */ onePortContext->RegFailed = agTRUE; return DM_RC_FAILURE; } } else { /* the discovery has started. Alloc and add have been done. find an expander using dmDeviceInfo, and update the expander's agDevHandle call dmExpFind() */ DM_DBG3(("dmRegisterDevice: NOT DM_DSTATE_NOT_STARTED\n")); sasAddressHi = DM_GET_SAS_ADDRESSHI(dmDeviceInfo->sasAddressHi); sasAddressLo = DM_GET_SAS_ADDRESSLO(dmDeviceInfo->sasAddressLo); DM_DBG3(("dmRegisterDevice: AddrHi 0x%08x AddrLo 0x%08x\n", sasAddressHi, sasAddressLo)); oneExpander = dmExpFind(dmRoot, onePortContext, sasAddressHi, sasAddressLo); if ( oneExpander != agNULL) { oneExpander->agDevHandle = agDevHandle; } else { DM_DBG1(("dmRegisterDevice: not allowed case, wrong !!!\n")); return DM_RC_FAILURE; } } return DM_RC_SUCCESS; } osGLOBAL dmExpander_t * dmDiscoveringExpanderAlloc( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmExpander_t *oneExpander = agNULL; dmList_t *ExpanderList; DM_DBG3(("dmDiscoveringExpanderAlloc: start\n")); DM_DBG3(("dmDiscoveringExpanderAlloc: did %d\n", oneDeviceData->id)); DM_DBG3(("dmDiscoveringExpanderAlloc: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveringExpanderAlloc: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); if (onePortContext->valid == agFALSE) { DM_DBG1(("dmDiscoveringExpanderAlloc: invalid port!!!\n")); return agNULL; } /* check existence in dmAllShared->mainExpanderList */ oneExpander = dmExpMainListFind(dmRoot, onePortContext, oneDeviceData->SASAddressID.sasAddressHi, oneDeviceData->SASAddressID.sasAddressLo); if (oneExpander == agNULL) { tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); if (DMLIST_EMPTY(&(dmAllShared->freeExpanderList))) { DM_DBG1(("dmDiscoveringExpanderAlloc: no free expanders pid %d!!!\n", onePortContext->id)); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); return agNULL; } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK);
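/* dequeue a pre-allocated expander from the shared free list; note that the empty check above and this dequeue sit in two separate DM_EXPANDER_LOCK critical sections, so the list could in principle drain in between; the code as written assumes that cannot happen here */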
DMLIST_DEQUEUE_FROM_HEAD(&ExpanderList, &(dmAllShared->freeExpanderList)); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); oneExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); } if (oneExpander != agNULL) { DM_DBG1(("dmDiscoveringExpanderAlloc: pid %d exp id %d \n", onePortContext->id, oneExpander->id)); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_DEQUEUE_THIS(&(oneExpander->linkNode)); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); oneExpander->dmDevice = oneDeviceData; oneExpander->dmUpStreamExpander = agNULL; oneExpander->dmCurrentDownStreamExpander = agNULL; oneExpander->dmReturnginExpander = agNULL; oneExpander->hasUpStreamDevice = agFALSE; oneExpander->numOfUpStreamPhys = 0; oneExpander->currentUpStreamPhyIndex = 0; oneExpander->discoveringPhyId = 0; oneExpander->underDiscovering = agFALSE; dm_memset( &(oneExpander->currentIndex), 0, sizeof(oneExpander->currentIndex)); oneDeviceData->dmExpander = oneExpander; DM_DBG3(("dmDiscoveringExpanderAlloc: oneDeviceData %p did %d\n", oneDeviceData, oneDeviceData->id)); DM_DBG3(("dmDiscoveringExpanderAlloc: oneExpander %p did %d\n", oneDeviceData->dmExpander, oneDeviceData->dmExpander->id)); } return oneExpander; } osGLOBAL void dmDiscoveringExpanderAdd( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { DM_DBG3(("dmDiscoveringExpanderAdd: start\n")); DM_DBG3(("dmDiscoveringExpanderAdd: expander id %d\n", oneExpander->id)); DM_DBG3(("dmDiscoveringExpanderAdd: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveringExpanderAdd: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); if (onePortContext->valid == agFALSE) { DM_DBG1(("dmDiscoveringExpanderAdd: invalid port!!!\n")); return; } if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { DM_DBG3(("dmDiscoveringExpanderAdd: UPSTREAM\n")); } else if (onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) { DM_DBG3(("dmDiscoveringExpanderAdd: DOWNSTREAM\n")); } else { DM_DBG3(("dmDiscoveringExpanderAdd: status %d\n", onePortContext->discovery.status)); } if ( oneExpander->underDiscovering == agFALSE) { DM_DBG3(("dmDiscoveringExpanderAdd: ADDED \n")); oneExpander->underDiscovering = agTRUE; tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_ENQUEUE_AT_TAIL(&(oneExpander->linkNode), &(onePortContext->discovery.discoveringExpanderList)); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } return; } osGLOBAL dmExpander_t * dmFindConfigurableExp( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { dmExpander_t *tempExpander; dmIntPortContext_t *tmpOnePortContext = onePortContext; dmExpander_t *ret = agNULL; DM_DBG3(("dmFindConfigurableExp: start\n")); if (oneExpander == agNULL) { DM_DBG3(("dmFindConfigurableExp: NULL expander\n")); return agNULL; } DM_DBG3(("dmFindConfigurableExp: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmFindConfigurableExp: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); if (DMLIST_EMPTY(&(tmpOnePortContext->discovery.discoveringExpanderList))) { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); DM_DBG3(("dmFindConfigurableExp: empty UpdiscoveringExpanderList\n")); return agNULL; } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } tempExpander = oneExpander->dmUpStreamExpander; while (tempExpander) { DM_DBG3(("dmFindConfigurableExp: loop exp addrHi 0x%08x\n", 
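/* walk up the dmUpStreamExpander chain from oneExpander until an expander with configRouteTable set is found, or the chain ends */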
tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmFindConfigurableExp: loop exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); if (tempExpander->configRouteTable) { DM_DBG3(("dmFindConfigurableExp: found configurable expander\n")); ret = tempExpander; break; } tempExpander = tempExpander->dmUpStreamExpander; } return ret; } osGLOBAL bit32 dmDuplicateConfigSASAddr( dmRoot_t *dmRoot, dmExpander_t *oneExpander, bit32 configSASAddressHi, bit32 configSASAddressLo ) { bit32 i; bit32 ret = agFALSE; DM_DBG3(("dmDuplicateConfigSASAddr: start\n")); if (oneExpander == agNULL) { DM_DBG3(("dmDuplicateConfigSASAddr: NULL expander\n")); return agTRUE; } if (oneExpander->dmDevice->SASAddressID.sasAddressHi == configSASAddressHi && oneExpander->dmDevice->SASAddressID.sasAddressLo == configSASAddressLo ) { DM_DBG3(("dmDuplicateConfigSASAddr: unnecessary\n")); return agTRUE; } DM_DBG3(("dmDuplicateConfigSASAddr: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmDuplicateConfigSASAddr: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); DM_DBG3(("dmDuplicateConfigSASAddr: configsasAddressHi 0x%08x\n", configSASAddressHi)); DM_DBG3(("dmDuplicateConfigSASAddr: configsasAddressLo 0x%08x\n", configSASAddressLo)); DM_DBG3(("dmDuplicateConfigSASAddr: configSASAddrTableIndex %d\n", oneExpander->configSASAddrTableIndex)); for (i = 0; i < oneExpander->configSASAddrTableIndex; i++) { if (oneExpander->configSASAddressHiTable[i] == configSASAddressHi && oneExpander->configSASAddressLoTable[i] == configSASAddressLo ) { DM_DBG3(("dmDuplicateConfigSASAddr: FOUND\n")); ret = agTRUE; break; } } /* new one; let's add it */ if (ret == agFALSE) { DM_DBG3(("dmDuplicateConfigSASAddr: adding configSAS Addr\n")); DM_DBG3(("dmDuplicateConfigSASAddr: configSASAddrTableIndex %d\n", oneExpander->configSASAddrTableIndex)); oneExpander->configSASAddressHiTable[oneExpander->configSASAddrTableIndex] = configSASAddressHi; oneExpander->configSASAddressLoTable[oneExpander->configSASAddrTableIndex] = configSASAddressLo; oneExpander->configSASAddrTableIndex++; } return ret; } osGLOBAL bit16 dmFindCurrentDownStreamPhyIndex( dmRoot_t *dmRoot, dmExpander_t *oneExpander ) { dmExpander_t *DownStreamExpander; bit16 index = 0; bit16 i; bit8 phyId = 0; DM_DBG3(("dmFindCurrentDownStreamPhyIndex: start\n")); if (oneExpander == agNULL) { DM_DBG1(("dmFindCurrentDownStreamPhyIndex: wrong, oneExpander is NULL!!!\n")); return 0; } DownStreamExpander = oneExpander->dmCurrentDownStreamExpander; if (DownStreamExpander == agNULL) { DM_DBG1(("dmFindCurrentDownStreamPhyIndex: wrong, DownStreamExpander is NULL!!!\n")); return 0; } DM_DBG3(("dmFindCurrentDownStreamPhyIndex: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmFindCurrentDownStreamPhyIndex: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); DM_DBG3(("dmFindCurrentDownStreamPhyIndex: downstream exp addrHi 0x%08x\n", DownStreamExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmFindCurrentDownStreamPhyIndex: downstream exp addrLo 0x%08x\n", DownStreamExpander->dmDevice->SASAddressID.sasAddressLo)); DM_DBG3(("dmFindCurrentDownStreamPhyIndex: numOfDownStreamPhys %d\n", oneExpander->numOfDownStreamPhys)); phyId = DownStreamExpander->upStreamPhys[0]; DM_DBG3(("dmFindCurrentDownStreamPhyIndex: phyId %d\n", phyId)); for (i = 0; i < oneExpander->numOfDownStreamPhys; i++) { if (oneExpander->downStreamPhys[i] == phyId) { index = i; break; } } DM_DBG3(("dmFindCurrentDownStreamPhyIndex: index
%d\n", index)); return index; } osGLOBAL bit32 dmFindDiscoveringExpander( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { dmList_t *ExpanderList; dmExpander_t *tempExpander; dmIntPortContext_t *tmpOnePortContext = onePortContext; bit32 ret = agFALSE; DM_DBG3(("dmFindDiscoveringExpander: start\n")); DM_DBG3(("dmFindDiscoveringExpander: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmFindDiscoveringExpander: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); if (DMLIST_EMPTY(&(tmpOnePortContext->discovery.discoveringExpanderList))) { DM_DBG3(("dmFindDiscoveringExpander: empty discoveringExpanderList\n")); return ret; } ExpanderList = tmpOnePortContext->discovery.discoveringExpanderList.flink; while (ExpanderList != &(tmpOnePortContext->discovery.discoveringExpanderList)) { tempExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if (tempExpander == oneExpander) { if (tempExpander != agNULL) { DM_DBG3(("dmFindDiscoveringExpander: match, expander id %d\n", tempExpander->id)); DM_DBG3(("dmFindDiscoveringExpander: exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmFindDiscoveringExpander: exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); } ret = agTRUE; break; } ExpanderList = ExpanderList->flink; } return ret; } osGLOBAL void dmDiscoveringExpanderRemove( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; DM_DBG3(("dmDiscoveringExpanderRemove: start\n")); DM_DBG3(("dmDiscoveringExpanderRemove: expander id %d\n", oneExpander->id)); DM_DBG3(("dmDiscoveringExpanderRemove: exp addrHi 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveringExpanderRemove: exp addrLo 0x%08x\n", oneExpander->dmDevice->SASAddressID.sasAddressLo)); DM_DBG3(("dmDiscoveringExpanderRemove: BEFORE\n")); dmDumpAllExp(dmRoot, onePortContext, oneExpander); dmDumpAllUpExp(dmRoot, onePortContext, oneExpander); dmDumpAllFreeExp(dmRoot); // if is temporary till smp problem is fixed if (dmFindDiscoveringExpander(dmRoot, onePortContext, oneExpander) == agTRUE) { DM_DBG3(("dmDiscoveringExpanderRemove: oneDeviceData %p did %d\n", oneExpander->dmDevice, oneExpander->dmDevice->id)); DM_DBG3(("dmDiscoveringExpanderRemove: oneExpander %p did %d\n", oneExpander, oneExpander->id)); if (oneExpander != oneExpander->dmDevice->dmExpander) { DM_DBG3(("dmDiscoveringExpanderRemove: before !!! 
wrong !!!\n")); } oneExpander->underDiscovering = agFALSE; oneExpander->discoveringPhyId = 0; tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_DEQUEUE_THIS(&(oneExpander->linkNode)); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); if (onePortContext->discovery.status == DISCOVERY_UP_STREAM) { DM_DBG3(("dmDiscoveringExpanderRemove: DISCOVERY_UP_STREAM\n")); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_ENQUEUE_AT_TAIL(&(oneExpander->upNode), &(onePortContext->discovery.UpdiscoveringExpanderList)); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); onePortContext->discovery.NumOfUpExp++; } else { DM_DBG3(("dmDiscoveringExpanderRemove: Status %d\n", onePortContext->discovery.status)); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_ENQUEUE_AT_TAIL(&(oneExpander->linkNode), &(dmAllShared->mainExpanderList)); // DMLIST_ENQUEUE_AT_TAIL(&(oneExpander->linkNode), &(dmAllShared->freeExpanderList)); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } // error checking if (oneExpander != oneExpander->dmDevice->dmExpander) { DM_DBG3(("dmDiscoveringExpanderRemove: after !!! wrong !!!\n")); } } //end temp if else { DM_DBG1(("dmDiscoveringExpanderRemove: !!! problem !!!\n")); } DM_DBG3(("dmDiscoveringExpanderRemove: AFTER\n")); dmDumpAllExp(dmRoot, onePortContext, oneExpander); dmDumpAllUpExp(dmRoot, onePortContext, oneExpander); dmDumpAllFreeExp(dmRoot); return; } /* returns an expander with sasAddrLo, sasAddrHi from dmAllShared->mainExpanderList */ osGLOBAL dmExpander_t * dmExpMainListFind( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, bit32 sasAddrHi, bit32 sasAddrLo ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmList_t *ExpanderList; dmExpander_t *tempExpander; DM_DBG3(("dmExpMainListFind: start\n")); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); if (DMLIST_EMPTY(&(dmAllShared->mainExpanderList))) { DM_DBG1(("dmExpMainListFind: empty mainExpanderList\n")); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); return agNULL; } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } ExpanderList = dmAllShared->mainExpanderList.flink; while (ExpanderList != &(dmAllShared->mainExpanderList)) { tempExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if (tempExpander == agNULL) { DM_DBG1(("dmExpMainListFind: tempExpander is NULL!!!\n")); return agNULL; } DM_DBG3(("dmExpMainListFind: expander id %d\n", tempExpander->id)); DM_DBG3(("dmExpMainListFind: exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmExpMainListFind: exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); if ((tempExpander->dmDevice->SASAddressID.sasAddressHi == sasAddrHi) && (tempExpander->dmDevice->SASAddressID.sasAddressLo == sasAddrLo) && (tempExpander->dmDevice->dmPortContext == onePortContext) ) { DM_DBG3(("dmExpMainListFind: found expander id %d\n", tempExpander->id)); DM_DBG3(("dmExpMainListFind: found exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmExpMainListFind: found exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); return tempExpander; } ExpanderList = ExpanderList->flink; } return agNULL; } /* returns an expander with sasAddrLo, sasAddrHi from discoveringExpanderList */ osGLOBAL dmExpander_t * dmExpFind( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, bit32 sasAddrHi, bit32 sasAddrLo ) { dmList_t *ExpanderList; dmExpander_t *tempExpander; 
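/* Note (added for clarity): dmExpFind() is the per-port analogue of dmExpMainListFind() above; it searches the port's discoveringExpanderList rather than dmAllShared->mainExpanderList, matching on both 32-bit halves of the SAS address and on the owning port context. */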
dmIntPortContext_t *tmpOnePortContext = onePortContext; DM_DBG3(("dmExpFind: start\n")); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); if (DMLIST_EMPTY(&(tmpOnePortContext->discovery.discoveringExpanderList))) { DM_DBG3(("dmExpFind: empty discoveringExpanderList\n")); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); return agNULL; } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } ExpanderList = tmpOnePortContext->discovery.discoveringExpanderList.flink; while (ExpanderList != &(tmpOnePortContext->discovery.discoveringExpanderList)) { tempExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if (tempExpander == agNULL) { DM_DBG1(("dmExpFind: tempExpander is NULL!!!\n")); return agNULL; } DM_DBG3(("dmExpFind: expander id %d\n", tempExpander->id)); DM_DBG3(("dmExpFind: exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmExpFind: exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); if ((tempExpander->dmDevice->SASAddressID.sasAddressHi == sasAddrHi) && (tempExpander->dmDevice->SASAddressID.sasAddressLo == sasAddrLo) && (tempExpander->dmDevice->dmPortContext == onePortContext) ) { DM_DBG3(("dmExpFind: found\n")); return tempExpander; } ExpanderList = ExpanderList->flink; } return agNULL; } osGLOBAL bit32 dmDiscoverCheck( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { DM_DBG3(("dmDiscoverCheck: start\n")); if (onePortContext == agNULL) { DM_DBG1(("dmDiscoverCheck: onePortContext is NULL!!!\n")); return agTRUE; } if (onePortContext->valid == agFALSE) { DM_DBG1(("dmDiscoverCheck: invalid port!!!\n")); return agTRUE; } if (onePortContext->DiscoveryState == DM_DSTATE_COMPLETED || onePortContext->discovery.status == DISCOVERY_SAS_DONE ) { DM_DBG1(("dmDiscoverCheck: aborted discovery!!!\n")); tddmDiscoverCB( dmRoot, onePortContext->dmPortContext, dmDiscAborted ); return agTRUE; } return agFALSE; } /* ??? needs to handle pending SMPs; move from onePortContext->discovery.discoveringExpanderList to dmAllShared->mainExpanderList */ osGLOBAL void dmDiscoverAbort( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { DM_DBG1(("dmDiscoverAbort: start\n")); if (onePortContext->DiscoveryState == DM_DSTATE_COMPLETED || onePortContext->discovery.status == DISCOVERY_SAS_DONE) { DM_DBG1(("dmDiscoverAbort: not allowed case!!!
onePortContext->DiscoveryState 0x%x onePortContext->discovery.status 0x%x\n", onePortContext->DiscoveryState, onePortContext->discovery.status)); return; } onePortContext->DiscoveryState = DM_DSTATE_COMPLETED; onePortContext->discovery.status = DISCOVERY_SAS_DONE; /* move from onePortContext->discovery.discoveringExpanderList to dmAllShared->mainExpanderList */ dmCleanAllExp(dmRoot, onePortContext); return; } /* move from onePortContext->discovery.discoveringExpanderList to dmAllShared->mainExpanderList */ osGLOBAL void dmCleanAllExp( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmList_t *ExpanderList; dmExpander_t *tempExpander; dmExpander_t *oneExpander = agNULL; dmIntPortContext_t *tmpOnePortContext = onePortContext; DM_DBG3(("dmCleanAllExp: start\n")); DM_DBG3(("dmCleanAllExp: pid %d\n", onePortContext->id)); DM_DBG3(("dmCleanAllExp: before all clean up\n")); dmDumpAllFreeExp(dmRoot); /* clean up discoveringExpanderList */ DM_DBG3(("dmCleanAllExp: clean discoveringExpanderList\n")); if (!DMLIST_EMPTY(&(tmpOnePortContext->discovery.discoveringExpanderList))) { ExpanderList = tmpOnePortContext->discovery.discoveringExpanderList.flink; while (ExpanderList != &(tmpOnePortContext->discovery.discoveringExpanderList)) { tempExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if (tempExpander == agNULL) { DM_DBG1(("dmCleanAllExp: tempExpander is NULL!!!\n")); return; } DM_DBG3(("dmCleanAllExp: exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmCleanAllExp: exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); DM_DBG3(("dmCleanAllExp: exp id %d\n", tempExpander->id)); oneExpander = dmExpMainListFind(dmRoot, tmpOnePortContext, tempExpander->dmDevice->SASAddressID.sasAddressHi, tempExpander->dmDevice->SASAddressID.sasAddressLo); if (oneExpander == agNULL) { DM_DBG3(("dmCleanAllExp: moving\n")); DM_DBG3(("dmCleanAllExp: moving, exp id %d\n", tempExpander->id)); /* putting back to the main expander list */ tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_DEQUEUE_THIS(&(tempExpander->linkNode)); // DMLIST_ENQUEUE_AT_TAIL(&(tempExpander->linkNode), &(dmAllShared->freeExpanderList)); DMLIST_ENQUEUE_AT_TAIL(&(tempExpander->linkNode), &(dmAllShared->mainExpanderList)); if (DMLIST_EMPTY(&(tmpOnePortContext->discovery.discoveringExpanderList))) { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); break; } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } ExpanderList = tmpOnePortContext->discovery.discoveringExpanderList.flink; } else { DM_DBG3(("dmCleanAllExp: in mainExpanderList; skipping\n")); ExpanderList = ExpanderList->flink; } } } else { DM_DBG3(("dmCleanAllExp: empty discoveringExpanderList\n")); } /* reset discoveringExpanderList */ DMLIST_INIT_HDR(&(tmpOnePortContext->discovery.discoveringExpanderList)); /* clean up UpdiscoveringExpanderList */ DM_DBG3(("dmCleanAllExp: clean UpdiscoveringExpanderList\n")); if (DMLIST_EMPTY(&(tmpOnePortContext->discovery.UpdiscoveringExpanderList))) { DM_DBG3(("dmCleanAllExp: empty UpdiscoveringExpanderList\n")); return; } ExpanderList = tmpOnePortContext->discovery.UpdiscoveringExpanderList.flink; while (ExpanderList != &(tmpOnePortContext->discovery.UpdiscoveringExpanderList)) { tempExpander = DMLIST_OBJECT_BASE(dmExpander_t, upNode, ExpanderList); if (tempExpander == agNULL) { DM_DBG1(("dmCleanAllExp: tempExpander is NULL!!!\n")); return; } DM_DBG3(("dmCleanAllExp:
exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmCleanAllExp: exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); DM_DBG3(("dmCleanAllExp: exp id %d\n", tempExpander->id)); oneExpander = dmExpMainListFind(dmRoot, tmpOnePortContext, tempExpander->dmDevice->SASAddressID.sasAddressHi, tempExpander->dmDevice->SASAddressID.sasAddressLo); if (oneExpander == agNULL) { DM_DBG3(("dmCleanAllExp: moving\n")); DM_DBG3(("dmCleanAllExp: moving exp id %d\n", tempExpander->id)); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_DEQUEUE_THIS(&(tempExpander->upNode)); DMLIST_ENQUEUE_AT_TAIL(&(tempExpander->linkNode), &(dmAllShared->mainExpanderList)); if (DMLIST_EMPTY(&(tmpOnePortContext->discovery.UpdiscoveringExpanderList))) { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); break; } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } ExpanderList = tmpOnePortContext->discovery.UpdiscoveringExpanderList.flink; } else { DM_DBG3(("dmCleanAllExp: in mainExpanderList; skipping\n")); ExpanderList = ExpanderList->flink; } } /* reset UpdiscoveringExpanderList */ DMLIST_INIT_HDR(&(tmpOnePortContext->discovery.UpdiscoveringExpanderList)); DM_DBG3(("dmCleanAllExp: after all clean up\n")); dmDumpAllFreeExp(dmRoot); return; } osGLOBAL void dmInternalRemovals( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG3(("dmInternalRemovals: start\n")); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if (DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG3(("dmInternalRemovals: empty device list\n")); return; } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); } DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmInternalRemovals: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmInternalRemovals: loop did %d\n", oneDeviceData->id)); DM_DBG3(("dmInternalRemovals: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmInternalRemovals: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3(("dmInternalRemovals: valid %d\n", oneDeviceData->valid)); DM_DBG3(("dmInternalRemovals: valid2 %d\n", oneDeviceData->valid2)); DM_DBG3(("dmInternalRemovals: directlyAttached %d\n", oneDeviceData->directlyAttached)); if ( oneDeviceData->dmPortContext == onePortContext) { DM_DBG3(("dmInternalRemovals: right portcontext pid %d\n", onePortContext->id)); if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_INCREMENTAL_START) { DM_DBG3(("dmInternalRemovals: incremental discovery\n")); oneDeviceData->valid2 = agFALSE; } else { DM_DBG3(("dmInternalRemovals: full discovery\n")); oneDeviceData->valid = agFALSE; } DeviceListList = DeviceListList->flink; } else { if (oneDeviceData->dmPortContext != agNULL) { DM_DBG3(("dmInternalRemovals: different portcontext; oneDeviceData->dmPortContext pid %d oneportcontext pid %d\n", oneDeviceData->dmPortContext->id, onePortContext->id)); } else { DM_DBG3(("dmInternalRemovals: different portcontext; oneDeviceData->dmPortContext pid NULL oneportcontext pid %d\n", onePortContext->id)); } DeviceListList = DeviceListList->flink; } } return; } osGLOBAL void
dmDiscoveryResetProcessed( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG3(("dmDiscoveryResetProcessed: start\n")); /* reinitialize the device data belonging to this portcontext */ DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmDiscoveryResetProcessed: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmDiscoveryResetProcessed: loop did %d\n", oneDeviceData->id)); if (oneDeviceData->dmPortContext == onePortContext) { DM_DBG3(("dmDiscoveryResetProcessed: resetting processed flag\n")); oneDeviceData->processed = agFALSE; } DeviceListList = DeviceListList->flink; } return; } /* calls osGLOBAL void tddmDiscoverCB( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext, bit32 eventStatus ) */ osGLOBAL void dmDiscoverDone( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, bit32 flag ) { DM_DBG3(("dmDiscoverDone: start\n")); DM_DBG3(("dmDiscoverDone: pid %d\n", onePortContext->id)); /* Set discovery status */ onePortContext->discovery.status = DISCOVERY_SAS_DONE; /* clean up expander data structures; move to free exp when device is cleaned */ dmCleanAllExp(dmRoot, onePortContext); dmDumpAllMainExp(dmRoot, onePortContext); dmDiscoveryResetProcessed(dmRoot, onePortContext); dmDiscoveryDumpMCN(dmRoot, onePortContext); if (onePortContext->discovery.SeenBC == agTRUE) { DM_DBG3(("dmDiscoverDone: broadcast change; discover again\n")); dmDiscoveryResetMCN(dmRoot, onePortContext); dmInternalRemovals(dmRoot, onePortContext); /* processed broadcast change */ onePortContext->discovery.SeenBC = agFALSE; if (onePortContext->discovery.ResetTriggerred == agTRUE) { DM_DBG3(("dmDiscoverDone: dmBCTimer\n")); dmBCTimer(dmRoot, onePortContext); } else { dmIncrementalDiscover(dmRoot, onePortContext, agTRUE); } } else { onePortContext->DiscoveryState = DM_DSTATE_COMPLETED; if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_FULL_START) { if (flag == DM_RC_SUCCESS) { dmResetReported(dmRoot, onePortContext ); dmDiscoveryReportMCN(dmRoot, onePortContext ); /* call tddmDiscoverCB() */ tddmDiscoverCB( dmRoot, onePortContext->dmPortContext, dmDiscCompleted ); } else if (flag != DM_RC_SUCCESS || onePortContext->discovery.DeferredError == agTRUE) { onePortContext->DiscoveryState = DM_DSTATE_COMPLETED_WITH_FAILURE; DM_DBG1(("dmDiscoverDone: Error; clean up!!!\n")); dmDiscoveryInvalidateDevices(dmRoot, onePortContext ); tddmDiscoverCB( dmRoot, onePortContext->dmPortContext, dmDiscFailed ); } } else { if (flag == DM_RC_SUCCESS) { dmReportChanges(dmRoot, onePortContext ); dmDiscoveryReportMCN(dmRoot, onePortContext ); tddmDiscoverCB( dmRoot, onePortContext->dmPortContext, dmDiscCompleted ); } else if (flag != DM_RC_SUCCESS || onePortContext->discovery.DeferredError == agTRUE) { onePortContext->DiscoveryState = DM_DSTATE_COMPLETED_WITH_FAILURE; dmDiscoveryInvalidateDevices(dmRoot, onePortContext ); tddmDiscoverCB( dmRoot, onePortContext->dmPortContext, dmDiscFailed ); } } } return; } /* called by dmDiscoveryErrorRemovals() or dmReportRemovals() on discovery failure */ osGLOBAL void dmSubReportRemovals( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData, bit32 flag ) { dmDeviceData_t
*oneAttachedExpDeviceData = agNULL; DM_DBG3(("dmSubReportRemovals: start\n")); DM_DBG3(("dmSubReportRemovals: flag 0x%x\n", flag)); if (flag == dmDeviceRemoval) { oneDeviceData->registered = agFALSE; } if (oneDeviceData->ExpDevice != agNULL) { DM_DBG3(("dmSubReportRemovals: attached expander case\n")); oneAttachedExpDeviceData = oneDeviceData->ExpDevice; tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, &oneAttachedExpDeviceData->dmDeviceInfo, flag); } else { DM_DBG3(("dmSubReportRemovals: NO attached expander case\n")); tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, agNULL, flag); } - /* this function is called at the end of discovery; reinitalizes oneDeviceData->reported */ + /* this function is called at the end of discovery; reinitializes oneDeviceData->reported */ oneDeviceData->reported = agFALSE; return; } /* called by dmReportChanges() on discovery success */ osGLOBAL void dmSubReportChanges( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData, bit32 flag ) { dmDeviceData_t *oneAttachedExpDeviceData = agNULL; DM_DBG3(("dmSubReportChanges: start\n")); DM_DBG3(("dmSubReportChanges: flag 0x%x\n", flag)); if (flag == dmDeviceRemoval) { oneDeviceData->registered = agFALSE; } if (oneDeviceData->reported == agFALSE) { if (oneDeviceData->ExpDevice != agNULL) { DM_DBG3(("dmSubReportChanges: attached expander case\n")); oneAttachedExpDeviceData = oneDeviceData->ExpDevice; tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, &oneAttachedExpDeviceData->dmDeviceInfo, flag); } else { DM_DBG3(("dmSubReportChanges: NO attached expander case\n")); tddmReportDevice(dmRoot, onePortContext->dmPortContext, &oneDeviceData->dmDeviceInfo, agNULL, flag); } } else { DM_DBG3(("dmSubReportChanges: skip; been reported\n")); } - /* this function is called at the end of discovery; reinitalizes oneDeviceData->reported */ + /* this function is called at the end of discovery; reinitializes oneDeviceData->reported */ oneDeviceData->reported = agFALSE; return; } /* should add or remove be reported per device??? 
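As written, dmSubReportChanges() already notifies TDM per device through tddmReportDevice(), while the aggregate arrival/removal events at the bottom of dmReportChanges() remain stubbed out under #if 0.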
*/ osGLOBAL void dmReportChanges( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; bit32 added = agFALSE, removed = agFALSE; // dmDeviceData_t *oneAttachedExpDeviceData = agNULL; DM_DBG3(("dmReportChanges: start\n")); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if (DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG3(("dmReportChanges: empty device list\n")); return; } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); } DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmReportChanges: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmReportChanges: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmReportChanges: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); if ( oneDeviceData->dmPortContext == onePortContext) { DM_DBG3(("dmReportChanges: right portcontext\n")); if (oneDeviceData->SASAddressID.sasAddressHi == onePortContext->sasRemoteAddressHi && oneDeviceData->SASAddressID.sasAddressLo == onePortContext->sasRemoteAddressLo ) { DM_DBG1(("dmReportChanges: keep, not reporting did 0x%x\n", oneDeviceData->id)); oneDeviceData->valid = agTRUE; oneDeviceData->valid2 = agFALSE; } else if ( (oneDeviceData->valid == agTRUE) && (oneDeviceData->valid2 == agTRUE) ) { DM_DBG3(("dmReportChanges: same\n")); /* reset valid bit */ oneDeviceData->valid = oneDeviceData->valid2; oneDeviceData->valid2 = agFALSE; dmSubReportChanges(dmRoot, onePortContext, oneDeviceData, dmDeviceNoChange); } else if ( (oneDeviceData->valid == agTRUE) && (oneDeviceData->valid2 == agFALSE) ) { DM_DBG3(("dmReportChanges: removed\n")); removed = agTRUE; /* reset valid bit */ oneDeviceData->valid = oneDeviceData->valid2; oneDeviceData->valid2 = agFALSE; onePortContext->RegisteredDevNums--; dmSubReportChanges(dmRoot, onePortContext, oneDeviceData, dmDeviceRemoval); } else if ( (oneDeviceData->valid == agFALSE) && (oneDeviceData->valid2 == agTRUE) ) { DM_DBG3(("dmReportChanges: added\n")); added = agTRUE; /* reset valid bit */ oneDeviceData->valid = oneDeviceData->valid2; oneDeviceData->valid2 = agFALSE; dmSubReportChanges(dmRoot, onePortContext, oneDeviceData, dmDeviceArrival); } else { DM_DBG3(("dmReportChanges: else\n")); } } else { DM_DBG3(("dmReportChanges: different portcontext\n")); } DeviceListList = DeviceListList->flink; } /* osGLOBAL void tddmReportDevice( dmRoot_t *dmRoot, dmPortContext_t *dmPortContext, dmDeviceInfo_t *dmDeviceInfo, dmDeviceInfo_t *dmExpDeviceInfo, bit32 flag ) */ /* arrival or removal at once */ if (added == agTRUE) { DM_DBG3(("dmReportChanges: added at the end\n")); #if 0 /* TBD */ ostiInitiatorEvent( tiRoot, onePortContext->tiPortalContext, agNULL, tiIntrEventTypeDeviceChange, tiDeviceArrival, agNULL ); #endif } if (removed == agTRUE) { DM_DBG3(("dmReportChanges: removed at the end\n")); #if 0 /* TBD */ ostiInitiatorEvent( tiRoot, onePortContext->tiPortalContext, agNULL, tiIntrEventTypeDeviceChange, tiDeviceRemoval, agNULL ); #endif } if (onePortContext->discovery.forcedOK == agTRUE && added == agFALSE && removed == agFALSE) { DM_DBG3(("dmReportChanges: missed chance to report. 
forced to report OK\n")); onePortContext->discovery.forcedOK = agFALSE; #if 0 /* TBD */ ostiInitiatorEvent( tiRoot, onePortContext->tiPortalContext, agNULL, tiIntrEventTypeDiscovery, tiDiscOK, agNULL ); #endif } if (added == agFALSE && removed == agFALSE) { DM_DBG3(("dmReportChanges: the same\n")); } return; } osGLOBAL void dmReportRemovals( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, bit32 flag ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; bit32 removed = agFALSE; DM_DBG1(("dmReportRemovals: start\n")); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if (DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG3(("dmReportRemovals: empty device list\n")); return; } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); } DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmReportRemovals: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmReportRemovals: loop did %d\n", oneDeviceData->id)); DM_DBG3(("dmReportRemovals: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmReportRemovals: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3(("dmReportRemovals: valid %d\n", oneDeviceData->valid)); DM_DBG3(("dmReportRemovals: valid2 %d\n", oneDeviceData->valid2)); DM_DBG3(("dmReportRemovals: directlyAttached %d\n", oneDeviceData->directlyAttached)); if ( oneDeviceData->dmPortContext == onePortContext) { DM_DBG3(("dmReportRemovals: right portcontext pid %d\n", onePortContext->id)); if (oneDeviceData->SASAddressID.sasAddressHi == onePortContext->sasRemoteAddressHi && oneDeviceData->SASAddressID.sasAddressLo == onePortContext->sasRemoteAddressLo ) { DM_DBG1(("dmReportRemovals: keeping\n")); oneDeviceData->valid = agTRUE; oneDeviceData->valid2 = agFALSE; } else if (oneDeviceData->valid == agTRUE) { DM_DBG3(("dmReportRemovals: removing\n")); /* notify only reported devices to OS layer*/ if ( DEVICE_IS_SSP_TARGET(oneDeviceData) || DEVICE_IS_STP_TARGET(oneDeviceData) || DEVICE_IS_SATA_DEVICE(oneDeviceData) ) { removed = agTRUE; } /* all targets except expanders */ DM_DBG3(("dmReportRemovals: did %d\n", oneDeviceData->id)); DM_DBG3(("dmReportRemovals: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmReportRemovals: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); onePortContext->RegisteredDevNums--; dmSubReportRemovals(dmRoot, onePortContext, oneDeviceData, dmDeviceRemoval); /* reset valid bit */ oneDeviceData->valid = agFALSE; oneDeviceData->valid2 = agFALSE; } /* called by port invalid case */ if (flag == agTRUE) { oneDeviceData->dmPortContext = agNULL; } DeviceListList = DeviceListList->flink; } else { if (oneDeviceData->dmPortContext != agNULL) { DM_DBG3(("dmReportRemovals: different portcontext; oneDeviceData->dmPortContext pid %d oneportcontext pid %d\n", oneDeviceData->dmPortContext->id, onePortContext->id)); } else { DM_DBG3(("dmReportRemovals: different portcontext; oneDeviceData->dmPortContext pid NULL oneportcontext pid %d\n", onePortContext->id)); } DeviceListList = DeviceListList->flink; } } if (removed == agTRUE) { DM_DBG3(("dmReportRemovals: removed at the end\n")); #if 0 /* TBD */ ostiInitiatorEvent( tiRoot, 
onePortContext->tiPortalContext, agNULL, tiIntrEventTypeDeviceChange, tiDeviceRemoval, agNULL ); #endif } return; } osGLOBAL void dmResetReported( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG3(("dmResetReported: start\n")); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if (DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG3(("dmResetReported: empty device list\n")); return; } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); } DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmResetReported: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmResetReported: loop did %d\n", oneDeviceData->id)); DM_DBG3(("dmResetReported: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmResetReported: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3(("dmResetReported: valid %d\n", oneDeviceData->valid)); DM_DBG3(("dmResetReported: valid2 %d\n", oneDeviceData->valid2)); DM_DBG3(("dmResetReported: directlyAttached %d\n", oneDeviceData->directlyAttached)); if ( oneDeviceData->dmPortContext == onePortContext) { DM_DBG3(("dmResetReported: right portcontext pid %d\n", onePortContext->id)); oneDeviceData->reported = agFALSE; DeviceListList = DeviceListList->flink; } else { if (oneDeviceData->dmPortContext != agNULL) { DM_DBG3(("dmResetReported: different portcontext; oneDeviceData->dmPortContext pid %d oneportcontext pid %d\n", oneDeviceData->dmPortContext->id, onePortContext->id)); } else { DM_DBG3(("dmResetReported: different portcontext; oneDeviceData->dmPortContext pid NULL oneportcontext pid %d\n", onePortContext->id)); } DeviceListList = DeviceListList->flink; } } return; } /* called on discover failure */ osGLOBAL void dmDiscoveryInvalidateDevices( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG1(("dmDiscoveryInvalidateDevices: start\n")); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if (DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG3(("dmDiscoveryInvalidateDevices: empty device list\n")); return; } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); } DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmDiscoveryInvalidateDevices: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmDiscoveryInvalidateDevices: loop did %d\n", oneDeviceData->id)); DM_DBG3(("dmDiscoveryInvalidateDevices: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveryInvalidateDevices: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3(("dmDiscoveryInvalidateDevices: valid %d\n", oneDeviceData->valid)); DM_DBG3(("dmDiscoveryInvalidateDevices: valid2 %d\n", oneDeviceData->valid2)); DM_DBG3(("dmDiscoveryInvalidateDevices: 
directlyAttached %d\n", oneDeviceData->directlyAttached)); if ( oneDeviceData->dmPortContext == onePortContext) { DM_DBG3(("dmDiscoveryInvalidateDevices: right portcontext pid %d\n", onePortContext->id)); if (oneDeviceData->SASAddressID.sasAddressHi == onePortContext->sasRemoteAddressHi && oneDeviceData->SASAddressID.sasAddressLo == onePortContext->sasRemoteAddressLo ) { DM_DBG1(("dmDiscoveryInvalidateDevices: keeping\n")); oneDeviceData->valid = agTRUE; oneDeviceData->valid2 = agFALSE; } else { oneDeviceData->valid = agFALSE; oneDeviceData->valid2 = agFALSE; oneDeviceData->registered = agFALSE; oneDeviceData->reported = agFALSE; /* all targets other than expanders */ DM_DBG3(("dmDiscoveryInvalidateDevices: did %d\n", oneDeviceData->id)); DM_DBG3(("dmDiscoveryInvalidateDevices: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveryInvalidateDevices: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); onePortContext->RegisteredDevNums--; } DeviceListList = DeviceListList->flink; } else { if (oneDeviceData->dmPortContext != agNULL) { DM_DBG3(("dmDiscoveryInvalidateDevices: different portcontext; oneDeviceData->dmPortContext pid %d oneportcontext pid %d\n", oneDeviceData->dmPortContext->id, onePortContext->id)); } else { DM_DBG3(("dmDiscoveryInvalidateDevices: different portcontext; oneDeviceData->dmPortContext pid NULL oneportcontext pid %d\n", onePortContext->id)); } DeviceListList = DeviceListList->flink; } } return; } /* Should DM report the device removal to TDM on an error case, or should DM simply remove the devices? For now, the second option. */ osGLOBAL void dmDiscoveryErrorRemovals( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG1(("dmDiscoveryErrorRemovals: start\n")); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if (DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG3(("dmDiscoveryErrorRemovals: empty device list\n")); return; } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); } DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmDiscoveryErrorRemovals: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmDiscoveryErrorRemovals: loop did %d\n", oneDeviceData->id)); DM_DBG3(("dmDiscoveryErrorRemovals: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveryErrorRemovals: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); DM_DBG3(("dmDiscoveryErrorRemovals: valid %d\n", oneDeviceData->valid)); DM_DBG3(("dmDiscoveryErrorRemovals: valid2 %d\n", oneDeviceData->valid2)); DM_DBG3(("dmDiscoveryErrorRemovals: directlyAttached %d\n", oneDeviceData->directlyAttached)); if ( oneDeviceData->dmPortContext == onePortContext) { DM_DBG3(("dmDiscoveryErrorRemovals: right portcontext pid %d\n", onePortContext->id)); if (oneDeviceData->SASAddressID.sasAddressHi == onePortContext->sasRemoteAddressHi && oneDeviceData->SASAddressID.sasAddressLo == onePortContext->sasRemoteAddressLo ) { DM_DBG1(("dmDiscoveryErrorRemovals: keeping\n")); oneDeviceData->valid = agTRUE; oneDeviceData->valid2 = agFALSE; } else { oneDeviceData->valid = agFALSE; oneDeviceData->valid2 = agFALSE; /*
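the stale device is invalidated and its removal is reported to TDM via dmSubReportRemovals();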
all targets other than expanders */ DM_DBG3(("dmDiscoveryErrorRemovals: did %d\n", oneDeviceData->id)); DM_DBG3(("dmDiscoveryErrorRemovals: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveryErrorRemovals: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); onePortContext->RegisteredDevNums--; dmSubReportRemovals(dmRoot, onePortContext, oneDeviceData, dmDeviceRemoval); } DeviceListList = DeviceListList->flink; } else { if (oneDeviceData->dmPortContext != agNULL) { DM_DBG3(("dmDiscoveryErrorRemovals: different portcontext; oneDeviceData->dmPortContext pid %d oneportcontext pid %d\n", oneDeviceData->dmPortContext->id, onePortContext->id)); } else { DM_DBG3(("dmDiscoveryErrorRemovals: different portcontext; oneDeviceData->dmPortContext pid NULL oneportcontext pid %d\n", onePortContext->id)); } DeviceListList = DeviceListList->flink; } } return; } /* move from dmAllShared->mainExpanderList to dmAllShared->freeExpanderList */ osGLOBAL void dmDiscoveryExpanderCleanUp( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmExpander_t *oneExpander = agNULL; dmList_t *ExpanderList = agNULL; dmDeviceData_t *oneDeviceData = agNULL; DM_DBG3(("dmDiscoveryExpanderCleanUp: start\n")); /* be sure to call osGLOBAL void dmExpanderDeviceDataReInit( dmRoot_t *dmRoot, dmExpander_t *oneExpander ); */ tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); if (!DMLIST_EMPTY(&(dmAllShared->mainExpanderList))) { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); ExpanderList = dmAllShared->mainExpanderList.flink; while (ExpanderList != &(dmAllShared->mainExpanderList)) { oneExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if (oneExpander == agNULL) { DM_DBG1(("dmDiscoveryExpanderCleanUp: oneExpander is NULL!!!\n")); return; } oneDeviceData = oneExpander->dmDevice; DM_DBG3(("dmDiscoveryExpanderCleanUp: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveryExpanderCleanUp: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); if ( oneDeviceData->dmPortContext == onePortContext) { dmExpanderDeviceDataReInit(dmRoot, oneExpander); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); DMLIST_DEQUEUE_THIS(&(oneExpander->linkNode)); DMLIST_ENQUEUE_AT_TAIL(&(oneExpander->linkNode), &(dmAllShared->freeExpanderList)); if (DMLIST_EMPTY(&(dmAllShared->mainExpanderList))) { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); break; } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } ExpanderList = dmAllShared->mainExpanderList.flink; } else { ExpanderList = ExpanderList->flink; } } } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); DM_DBG3(("dmDiscoveryExpanderCleanUp: empty mainExpanderList\n")); } return; } /* moves all devices from dmAllShared->MainDeviceList to dmAllShared->FreeDeviceList */ osGLOBAL void dmDiscoveryDeviceCleanUp( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; DM_DBG3(("dmDiscoveryDeviceCleanUp: start\n")); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if (!DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != 
&(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmDiscoveryDeviceCleanUp: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmDiscoveryDeviceCleanUp: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDiscoveryDeviceCleanUp: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); if ( oneDeviceData->dmPortContext == onePortContext) { dmDeviceDataReInit(dmRoot, oneDeviceData); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); DMLIST_DEQUEUE_THIS(&(oneDeviceData->MainLink)); DMLIST_ENQUEUE_AT_TAIL(&(oneDeviceData->FreeLink), &(dmAllShared->FreeDeviceList)); if (DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); break; } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); } onePortContext->RegisteredDevNums--; DeviceListList = dmAllShared->MainDeviceList.flink; } else { DeviceListList = DeviceListList->flink; } } } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG3(("dmDiscoveryDeviceCleanUp: empty MainDeviceList\n")); } return; } osGLOBAL void dmDumpAllExp( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { DM_DBG3(("dmDumpAllExp: start\n")); return; } osGLOBAL void dmDumpAllUpExp( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander ) { DM_DBG3(("dmDumpAllUpExp: start\n")); return; } osGLOBAL void dmDumpAllFreeExp( dmRoot_t *dmRoot ) { DM_DBG3(("dmDumpAllFreeExp: start\n")); return; } osGLOBAL void dmDumpAllMainExp( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmList_t *ExpanderList; dmExpander_t *tempExpander; DM_DBG3(("dmDumpAllMainExp: start\n")); tddmSingleThreadedEnter(dmRoot, DM_EXPANDER_LOCK); if (DMLIST_EMPTY(&(dmAllShared->mainExpanderList))) { DM_DBG3(("dmDumpAllMainExp: empty discoveringExpanderList\n")); tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); return; } else { tddmSingleThreadedLeave(dmRoot, DM_EXPANDER_LOCK); } ExpanderList = dmAllShared->mainExpanderList.flink; while (ExpanderList != &(dmAllShared->mainExpanderList)) { tempExpander = DMLIST_OBJECT_BASE(dmExpander_t, linkNode, ExpanderList); if (tempExpander == agNULL) { DM_DBG1(("dmDumpAllMainExp: tempExpander is NULL!!!\n")); return; } DM_DBG3(("dmDumpAllMainExp: expander id %d\n", tempExpander->id)); DM_DBG3(("dmDumpAllMainExp: exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmDumpAllMainExp: exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); if ((tempExpander->dmDevice->dmPortContext == onePortContext) ) { DM_DBG3(("dmDumpAllMainExp: found expander id %d\n", tempExpander->id)); DM_DBG3(("dmDumpAllMainExp: found exp addrHi 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressHi)); DM_DBG3(("dmDumpAllMainExp: found exp addrLo 0x%08x\n", tempExpander->dmDevice->SASAddressID.sasAddressLo)); } ExpanderList = ExpanderList->flink; } return; } osGLOBAL void dmDumpAllMainDevice( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; bit32 total = 0, port_total = 0; DM_DBG3(("dmDumpAllMainDevice: start\n")); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if 
(DMLIST_EMPTY(&(dmAllShared->MainDeviceList))) { DM_DBG3(("dmDumpAllMainDevice: empty discoveringExpanderList\n")); tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); return; } else { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); } DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG3(("dmDumpAllMainDevice: oneDeviceData is NULL!!!\n")); return; } DM_DBG3(("dmDumpAllMainDevice: oneDeviceData id %d\n", oneDeviceData->id)); DM_DBG3(("dmDumpAllMainDevice: addrHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDumpAllMainDevice: addrLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); total++; if ((oneDeviceData->dmPortContext == onePortContext) ) { DM_DBG3(("dmDumpAllMainDevice: found oneDeviceData id %d\n", oneDeviceData->id)); DM_DBG3(("dmDumpAllMainDevice: found addrHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDumpAllMainDevice: found addrLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); port_total++; } DeviceListList = DeviceListList->flink; } DM_DBG3(("dmDumpAllMainDevice: total %d port_totaol %d\n", total, port_total)); return; } osGLOBAL dmDeviceData_t * dmAddSASToSharedcontext( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmSASSubID_t *dmSASSubID, dmDeviceData_t *oneExpDeviceData, bit8 phyID ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; bit32 new_device = agTRUE; DM_DBG3(("dmAddSASToSharedcontext: start\n")); DM_DBG3(("dmAddSASToSharedcontext: oneportContext ID %d\n", onePortContext->id)); if (oneExpDeviceData != agNULL) { DM_DBG3(("dmAddSASToSharedcontext: oneExpDeviceData sasAddressHi 0x%08x sasAddressLo 0x%08x\n", oneExpDeviceData->SASAddressID.sasAddressHi, oneExpDeviceData->SASAddressID.sasAddressLo)); } else { DM_DBG3(("dmAddSASToSharedcontext: oneExpDeviceData is NULL\n")); } /* find a device's existence */ DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmAddSASToSharedcontext: oneDeviceData is NULL!!!\n")); return agNULL; } if ((oneDeviceData->SASAddressID.sasAddressHi == dmSASSubID->sasAddressHi) && (oneDeviceData->SASAddressID.sasAddressLo == dmSASSubID->sasAddressLo) && (oneDeviceData->dmPortContext == onePortContext) ) { DM_DBG3(("dmAddSASToSharedcontext: pid %d did %d\n", onePortContext->id, oneDeviceData->id)); new_device = agFALSE; break; } DeviceListList = DeviceListList->flink; } /* new device */ if (new_device == agTRUE) { DM_DBG3(("dmAddSASToSharedcontext: new device\n")); DM_DBG3(("dmAddSASToSharedcontext: sasAddressHi 0x%08x sasAddressLo 0x%08x\n", dmSASSubID->sasAddressHi, dmSASSubID->sasAddressLo)); tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); if (!DMLIST_NOT_EMPTY(&(dmAllShared->FreeDeviceList))) { tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG1(("dmAddSASToSharedcontext: empty DeviceData FreeLink\n")); dmDumpAllMainDevice(dmRoot, onePortContext); return agNULL; } DMLIST_DEQUEUE_FROM_HEAD(&DeviceListList, &(dmAllShared->FreeDeviceList)); tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, FreeLink, DeviceListList); if (oneDeviceData != 
agNULL) { DM_DBG3(("dmAddSASToSharedcontext: oneDeviceData %p pid %d did %d\n", oneDeviceData, onePortContext->id, oneDeviceData->id)); onePortContext->Count++; oneDeviceData->dmRoot = dmRoot; /* saving sas address */ oneDeviceData->SASAddressID.sasAddressLo = dmSASSubID->sasAddressLo; oneDeviceData->SASAddressID.sasAddressHi = dmSASSubID->sasAddressHi; oneDeviceData->initiator_ssp_stp_smp = dmSASSubID->initiator_ssp_stp_smp; oneDeviceData->target_ssp_stp_smp = dmSASSubID->target_ssp_stp_smp; oneDeviceData->dmPortContext = onePortContext; /* handles both SAS target and STP-target, SATA-device */ if (!DEVICE_IS_SATA_DEVICE(oneDeviceData) && !DEVICE_IS_STP_TARGET(oneDeviceData)) { oneDeviceData->DeviceType = DM_SAS_DEVICE; } else { oneDeviceData->DeviceType = DM_SATA_DEVICE; } if (oneExpDeviceData != agNULL) { oneDeviceData->ExpDevice = oneExpDeviceData; } /* set phyID only when it has initial value of 0xFF */ if (oneDeviceData->phyID == 0xFF) { oneDeviceData->phyID = phyID; } /* incremental discovery */ /* add device to incremental-related link. Report using this link when incremental discovery is done */ if (onePortContext->DiscoveryState == DM_DSTATE_NOT_STARTED) { DM_DBG3(("dmAddSASToSharedcontext: DM_DSTATE_NOT_STARTED\n")); DM_DBG3(("dmAddSASToSharedcontext: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmAddSASToSharedcontext: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); oneDeviceData->valid = agTRUE; } else { if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_INCREMENTAL_START) { DM_DBG3(("dmAddSASToSharedcontext: incremental discovery\n")); DM_DBG3(("dmAddSASToSharedcontext: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmAddSASToSharedcontext: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); oneDeviceData->valid2 = agTRUE; } else { DM_DBG3(("dmAddSASToSharedcontext: full discovery\n")); DM_DBG3(("dmAddSASToSharedcontext: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmAddSASToSharedcontext: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); oneDeviceData->valid = agTRUE; } } /* add the devicedata to the portcontext */ tddmSingleThreadedEnter(dmRoot, DM_DEVICE_LOCK); DMLIST_ENQUEUE_AT_TAIL(&(oneDeviceData->MainLink), &(dmAllShared->MainDeviceList)); tddmSingleThreadedLeave(dmRoot, DM_DEVICE_LOCK); DM_DBG3(("dmAddSASToSharedcontext: one case pid %d did %d \n", onePortContext->id, oneDeviceData->id)); DM_DBG3(("dmAddSASToSharedcontext: new case pid %d did %d phyID %d\n", onePortContext->id, oneDeviceData->id, oneDeviceData->phyID)); } } else /* old device */ { DM_DBG3(("dmAddSASToSharedcontext: old device\n")); DM_DBG3(("dmAddSASToSharedcontext: oneDeviceData %p did %d\n", oneDeviceData, oneDeviceData->id)); DM_DBG3(("dmAddSASToSharedcontext: sasAddressHi 0x%08x sasAddressLo 0x%08x\n", dmSASSubID->sasAddressHi, dmSASSubID->sasAddressLo)); oneDeviceData->dmRoot = dmRoot; /* saving sas address */ oneDeviceData->SASAddressID.sasAddressLo = dmSASSubID->sasAddressLo; oneDeviceData->SASAddressID.sasAddressHi = dmSASSubID->sasAddressHi; oneDeviceData->initiator_ssp_stp_smp = dmSASSubID->initiator_ssp_stp_smp; oneDeviceData->target_ssp_stp_smp = dmSASSubID->target_ssp_stp_smp; oneDeviceData->dmPortContext = onePortContext; /* handles both SAS target and STP-target, SATA-device */ if (!DEVICE_IS_SATA_DEVICE(oneDeviceData) && !DEVICE_IS_STP_TARGET(oneDeviceData)) { oneDeviceData->DeviceType = DM_SAS_DEVICE; } else { oneDeviceData->DeviceType = 
DM_SATA_DEVICE; } if (oneExpDeviceData != agNULL) { oneDeviceData->ExpDevice = oneExpDeviceData; } /* set phyID only when it has initial value of 0xFF */ if (oneDeviceData->phyID == 0xFF) { oneDeviceData->phyID = phyID; } if (onePortContext->DiscoveryState == DM_DSTATE_NOT_STARTED) { DM_DBG3(("dmAddSASToSharedcontext: DM_DSTATE_NOT_STARTED\n")); DM_DBG3(("dmAddSASToSharedcontext: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmAddSASToSharedcontext: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); oneDeviceData->valid = agTRUE; } else { if (onePortContext->discovery.type == DM_DISCOVERY_OPTION_INCREMENTAL_START) { DM_DBG3(("dmAddSASToSharedcontext: incremental discovery\n")); DM_DBG3(("dmAddSASToSharedcontext: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmAddSASToSharedcontext: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); oneDeviceData->valid2 = agTRUE; } else { DM_DBG3(("dmAddSASToSharedcontext: full discovery\n")); DM_DBG3(("dmAddSASToSharedcontext: sasAddrHi 0x%08x \n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmAddSASToSharedcontext: sasAddrLo 0x%08x \n", oneDeviceData->SASAddressID.sasAddressLo)); oneDeviceData->valid = agTRUE; } } DM_DBG3(("dmAddSASToSharedcontext: old case pid %d did %d phyID %d\n", onePortContext->id, oneDeviceData->id, oneDeviceData->phyID)); } return oneDeviceData; } /* no checking of valid and valid2 */ osGLOBAL dmDeviceData_t * dmDeviceFind( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, bit32 sasAddrHi, bit32 sasAddrLo ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDeviceData_t *oneDeviceData = agNULL; dmList_t *DeviceListList; bit32 found = agFALSE; DM_DBG3(("dmDeviceFind: start\n")); /* find a device's existence */ DeviceListList = dmAllShared->MainDeviceList.flink; while (DeviceListList != &(dmAllShared->MainDeviceList)) { oneDeviceData = DMLIST_OBJECT_BASE(dmDeviceData_t, MainLink, DeviceListList); if (oneDeviceData == agNULL) { DM_DBG1(("dmDeviceFind: oneDeviceData is NULL!!!\n")); return agNULL; } if ((oneDeviceData->SASAddressID.sasAddressHi == sasAddrHi) && (oneDeviceData->SASAddressID.sasAddressLo == sasAddrLo) && // (oneDeviceData->valid == agTRUE) && (oneDeviceData->dmPortContext == onePortContext) ) { DM_DBG3(("dmDeviceFind: Found pid %d did %d\n", onePortContext->id, oneDeviceData->id)); DM_DBG3(("dmDeviceFind: sasAddressHi 0x%08x\n", oneDeviceData->SASAddressID.sasAddressHi)); DM_DBG3(("dmDeviceFind: sasAddressLo 0x%08x\n", oneDeviceData->SASAddressID.sasAddressLo)); found = agTRUE; break; } DeviceListList = DeviceListList->flink; } if (found == agFALSE) { DM_DBG3(("dmDeviceFind: end returning NULL\n")); return agNULL; } else { DM_DBG3(("dmDeviceFind: end returning NOT NULL\n")); return oneDeviceData; } } osGLOBAL void dmBCTimer( dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDiscovery_t *discovery; DM_DBG3(("dmBCTimer: start\n")); discovery = &(onePortContext->discovery); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->BCTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->BCTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } if (onePortContext->valid == agTRUE) { dmSetTimerRequest( dmRoot, &discovery->BCTimer, 
BC_TIMER_VALUE/dmAllShared->usecsPerTick, dmBCTimerCB, onePortContext, agNULL, agNULL ); dmAddTimer( dmRoot, &dmAllShared->timerlist, &discovery->BCTimer ); } return; } osGLOBAL void dmBCTimerCB( dmRoot_t * dmRoot, void * timerData1, void * timerData2, void * timerData3 ) { dmIntPortContext_t *onePortContext; dmDiscovery_t *discovery; DM_DBG3(("dmBCTimerCB: start\n")); onePortContext = (dmIntPortContext_t *)timerData1; discovery = &(onePortContext->discovery); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->BCTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->BCTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } discovery->ResetTriggerred = agFALSE; if (onePortContext->valid == agTRUE) { dmDiscover(dmRoot, onePortContext->dmPortContext, DM_DISCOVERY_OPTION_INCREMENTAL_START ); } return; } /* discovery related SMP timers */ osGLOBAL void dmDiscoverySMPTimer(dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, bit32 functionCode, dmSMPRequestBody_t *dmSMPRequestBody ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDiscovery_t *discovery; DM_DBG3(("dmDiscoverySMPTimer: start\n")); DM_DBG3(("dmDiscoverySMPTimer: pid %d SMPFn 0x%x\n", onePortContext->id, functionCode)); /* start the SMP timer which works as SMP application timer */ discovery = &(onePortContext->discovery); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->DiscoverySMPTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->DiscoverySMPTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } dmSetTimerRequest( dmRoot, &discovery->DiscoverySMPTimer, SMP_TIMER_VALUE/dmAllShared->usecsPerTick, dmDiscoverySMPTimerCB, onePortContext, dmSMPRequestBody, agNULL ); dmAddTimer ( dmRoot, &dmAllShared->timerlist, &discovery->DiscoverySMPTimer ); return; } osGLOBAL void dmDiscoverySMPTimerCB( dmRoot_t * dmRoot, void * timerData1, void * timerData2, void * timerData3 ) { agsaRoot_t *agRoot; dmIntPortContext_t *onePortContext; bit8 SMPFunction; #ifndef DIRECT_SMP dmSMPFrameHeader_t *dmSMPFrameHeader; bit8 smpHeader[4]; #endif dmSMPRequestBody_t *dmSMPRequestBody; dmDiscovery_t *discovery; dmDeviceData_t *oneDeviceData; agsaIORequest_t *agAbortIORequest = agNULL; agsaIORequest_t *agToBeAbortIORequest = agNULL; dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmExpander_t *oneExpander = agNULL; dmSMPRequestBody_t *dmAbortSMPRequestBody = agNULL; dmList_t *SMPList; DM_DBG1(("dmDiscoverySMPTimerCB: start!!!\n")); onePortContext = (dmIntPortContext_t *)timerData1; dmSMPRequestBody = (dmSMPRequestBody_t *)timerData2; discovery = &(onePortContext->discovery); oneDeviceData = dmSMPRequestBody->dmDevice; agToBeAbortIORequest = &(dmSMPRequestBody->agIORequest); agRoot = dmAllShared->agRoot; #ifdef DIRECT_SMP SMPFunction = dmSMPRequestBody->smpPayload[1]; #else saFrameReadBlock(agRoot, dmSMPRequestBody->IndirectSMP, 0, smpHeader, 4); dmSMPFrameHeader = (dmSMPFrameHeader_t *)smpHeader; SMPFunction = dmSMPFrameHeader->smpFunction; #endif DM_DBG3(("dmDiscoverySMPTimerCB: SMP function 0x%x\n", SMPFunction)); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->DiscoverySMPTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->DiscoverySMPTimer ); } else { 
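/* DiscoverySMPTimer is not running; nothing to kill, just release the timer lock */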
tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } //for debugging // saGetPendingPICI(agRoot); switch (SMPFunction) { case SMP_REPORT_GENERAL: /* fall through */ case SMP_DISCOVER: /* fall through */ case SMP_CONFIGURE_ROUTING_INFORMATION: /* fall through */ DM_DBG1(("dmDiscoverySMPTimerCB: failing discovery, SMP function 0x%x !!!\n", SMPFunction)); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; /* no more things to do */ case SMP_REPORT_PHY_SATA: DM_DBG1(("dmDiscoverySMPTimerCB: failing discovery, SMP function SMP_REPORT_PHY_SATA !!!\n")); dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); break; default: /* do nothing */ DM_DBG1(("dmDiscoverySMPTimerCB: Error, not allowed case!!!\n")); break; } if (oneDeviceData->registered == agTRUE && (oneDeviceData->valid == agTRUE || oneDeviceData->valid2 == agTRUE) ) { /* call to saSMPAbort(one) */ /* get an smp REQUEST from the free list */ tddmSingleThreadedEnter(dmRoot, DM_SMP_LOCK); if (DMLIST_EMPTY(&(dmAllShared->freeSMPList))) { DM_DBG1(("dmDiscoverySMPTimerCB: no free SMP, can't abort SMP!!!\n")); tddmSingleThreadedLeave(dmRoot, DM_SMP_LOCK); return; } else { DMLIST_DEQUEUE_FROM_HEAD(&SMPList, &(dmAllShared->freeSMPList)); tddmSingleThreadedLeave(dmRoot, DM_SMP_LOCK); dmAbortSMPRequestBody = DMLIST_OBJECT_BASE(dmSMPRequestBody_t, Link, SMPList); if (dmAbortSMPRequestBody == agNULL) { DM_DBG1(("dmDiscoverySMPTimerCB: dmAbortSMPRequestBody is NULL!!!\n")); return; } DM_DBG5(("dmDiscoverySMPTimerCB: SMP id %d\n", dmAbortSMPRequestBody->id)); } dmAbortSMPRequestBody->dmRoot = dmRoot; agAbortIORequest = &(dmAbortSMPRequestBody->agIORequest); agAbortIORequest->osData = (void *) dmAbortSMPRequestBody; agAbortIORequest->sdkData = agNULL; /* SALL takes care of this */ oneExpander = oneDeviceData->dmExpander; DM_DBG1(("dmDiscoverySMPTimerCB: calling saSMPAbort!!!\n")); saSMPAbort(agRoot, agAbortIORequest, 0, oneExpander->agDevHandle, 0, /* abort one */ agToBeAbortIORequest, dmSMPAbortCB ); } return; } osGLOBAL void dmSMPBusyTimer(dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData, dmSMPRequestBody_t *dmSMPRequestBody ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDiscovery_t *discovery; DM_DBG3(("dmSMPBusyTimer: start\n")); DM_DBG3(("dmSMPBusyTimer: pid %d\n", onePortContext->id)); discovery = &(onePortContext->discovery); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->SMPBusyTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->SMPBusyTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } dmSetTimerRequest( dmRoot, &discovery->SMPBusyTimer, SMP_BUSY_TIMER_VALUE/dmAllShared->usecsPerTick, dmSMPBusyTimerCB, onePortContext, oneDeviceData, dmSMPRequestBody ); dmAddTimer ( dmRoot, &dmAllShared->timerlist, &discovery->SMPBusyTimer ); return; } osGLOBAL void dmSMPBusyTimerCB( dmRoot_t * dmRoot, void * timerData1, void * timerData2, void * timerData3 ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; agsaRoot_t *agRoot; dmIntPortContext_t *onePortContext; dmDeviceData_t *oneDeviceData; dmSMPRequestBody_t *dmSMPRequestBody; agsaSASRequestBody_t *agSASRequestBody; agsaIORequest_t *agIORequest; agsaDevHandle_t *agDevHandle; dmDiscovery_t *discovery; bit32 status = AGSA_RC_FAILURE; dmExpander_t *oneExpander = agNULL; DM_DBG3(("dmSMPBusyTimerCB: start\n")); 
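/* Busy-retry scheme (added note): while the SALL layer keeps returning AGSA_RC_BUSY the SMP request is re-armed through dmSMPBusyTimer() up to SMP_BUSY_RETRIES times; AGSA_RC_SUCCESS resets the retry count, and AGSA_RC_FAILURE or exhausted retries fails the whole discovery via dmDiscoverDone(). */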
onePortContext = (dmIntPortContext_t *)timerData1; oneDeviceData = (dmDeviceData_t *)timerData2; dmSMPRequestBody = (dmSMPRequestBody_t *)timerData3; agRoot = dmAllShared->agRoot; agIORequest = &(dmSMPRequestBody->agIORequest); oneExpander = oneDeviceData->dmExpander; agDevHandle = oneExpander->agDevHandle; agSASRequestBody = &(dmSMPRequestBody->agSASRequestBody); discovery = &(onePortContext->discovery); discovery->SMPRetries++; if (discovery->SMPRetries < SMP_BUSY_RETRIES) { status = saSMPStart( agRoot, agIORequest, 0, agDevHandle, AGSA_SMP_INIT_REQ, agSASRequestBody, &dmsaSMPCompleted ); } if (status == AGSA_RC_SUCCESS) { discovery->SMPRetries = 0; tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->SMPBusyTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->SMPBusyTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } } else if (status == AGSA_RC_FAILURE) { tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->SMPBusyTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->SMPBusyTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } discovery->SMPRetries = 0; dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } else /* AGSA_RC_BUSY */ { if (discovery->SMPRetries >= SMP_BUSY_RETRIES) { /* done with retries; give up */ DM_DBG3(("dmSMPBusyTimerCB: retries are over\n")); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->SMPBusyTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->SMPBusyTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } discovery->SMPRetries = 0; dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); } else { /* keep retrying */ dmSMPBusyTimer(dmRoot, onePortContext, oneDeviceData, dmSMPRequestBody); } } return; } /* expander configuring timer */ osGLOBAL void dmDiscoveryConfiguringTimer(dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmDeviceData_t *oneDeviceData ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDiscovery_t *discovery; DM_DBG3(("dmDiscoveryConfiguringTimer: start\n")); DM_DBG3(("dmDiscoveryConfiguringTimer: pid %d\n", onePortContext->id)); discovery = &(onePortContext->discovery); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->discoveryTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->discoveryTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } DM_DBG3(("dmDiscoveryConfiguringTimer: UsecsPerTick %d\n", dmAllShared->usecsPerTick)); DM_DBG3(("dmDiscoveryConfiguringTimer: Timervalue %d\n", DISCOVERY_CONFIGURING_TIMER_VALUE/dmAllShared->usecsPerTick)); dmSetTimerRequest( dmRoot, &discovery->discoveryTimer, DISCOVERY_CONFIGURING_TIMER_VALUE/dmAllShared->usecsPerTick, dmDiscoveryConfiguringTimerCB, onePortContext, oneDeviceData, agNULL ); dmAddTimer ( dmRoot, &dmAllShared->timerlist, &discovery->discoveryTimer ); return; } osGLOBAL void dmDiscoveryConfiguringTimerCB( dmRoot_t * dmRoot, void * timerData1, void * timerData2, void * timerData3 ) { dmIntPortContext_t *onePortContext = agNULL; dmDiscovery_t *discovery = agNULL; dmDeviceData_t *oneDeviceData = agNULL; onePortContext = (dmIntPortContext_t *)timerData1; oneDeviceData = (dmDeviceData_t *)timerData2; discovery = &(onePortContext->discovery);
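/*
 * Editor's note: every timeout in this file is programmed as a duration in
 * microseconds divided by dmAllShared->usecsPerTick, converting it to timer
 * ticks.  Worked example with assumed numbers (not taken from this source):
 * with a 10 ms tick (usecsPerTick = 10000), a 20 s configuring timeout
 * (20000000 us) is armed as 20000000 / 10000 = 2000 ticks.
 */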
DM_DBG3(("dmDiscoveryConfiguringTimerCB: start\n")); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->discoveryTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->discoveryTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } if (oneDeviceData->valid == agTRUE || oneDeviceData->valid2 == agTRUE) { dmReportGeneralSend(dmRoot, oneDeviceData); } return; } osGLOBAL void dmConfigureRouteTimer(dmRoot_t *dmRoot, dmIntPortContext_t *onePortContext, dmExpander_t *oneExpander, smpRespDiscover_t *pdmSMPDiscoverResp, smpRespDiscover2_t *pdmSMPDiscover2Resp ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmDiscovery_t *discovery; DM_DBG3(("dmConfigureRouteTimer: start\n")); DM_DBG3(("dmConfigureRouteTimer: pid %d\n", onePortContext->id)); discovery = &(onePortContext->discovery); DM_DBG3(("dmConfigureRouteTimer: onePortContext %p oneExpander %p pdmSMPDiscoverResp %p\n", onePortContext, oneExpander, pdmSMPDiscoverResp)); DM_DBG3(("dmConfigureRouteTimer: discovery %p \n", discovery)); DM_DBG3(("dmConfigureRouteTimer: pid %d configureRouteRetries %d\n", onePortContext->id, discovery->configureRouteRetries)); DM_DBG3(("dmConfigureRouteTimer: discovery->status %d\n", discovery->status)); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->configureRouteTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->configureRouteTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } DM_DBG3(("dmConfigureRouteTimer: UsecsPerTick %d\n", dmAllShared->usecsPerTick)); DM_DBG3(("dmConfigureRouteTimer: Timervalue %d\n", CONFIGURE_ROUTE_TIMER_VALUE/dmAllShared->usecsPerTick)); if (oneExpander->SAS2 == 0) { /* SAS 1.1 */ dmSetTimerRequest( dmRoot, &discovery->configureRouteTimer, CONFIGURE_ROUTE_TIMER_VALUE/dmAllShared->usecsPerTick, dmConfigureRouteTimerCB, (void *)onePortContext, (void *)oneExpander, (void *)pdmSMPDiscoverResp ); } else { /* SAS 2 */ dmSetTimerRequest( dmRoot, &discovery->configureRouteTimer, CONFIGURE_ROUTE_TIMER_VALUE/dmAllShared->usecsPerTick, dmConfigureRouteTimerCB, (void *)onePortContext, (void *)oneExpander, (void *)pdmSMPDiscover2Resp ); } dmAddTimer ( dmRoot, &dmAllShared->timerlist, &discovery->configureRouteTimer ); return; } osGLOBAL void dmConfigureRouteTimerCB( dmRoot_t * dmRoot, void * timerData1, void * timerData2, void * timerData3 ) { dmIntRoot_t *dmIntRoot = (dmIntRoot_t *)dmRoot->dmData; dmIntContext_t *dmAllShared = (dmIntContext_t *)&dmIntRoot->dmAllShared; dmIntPortContext_t *onePortContext; dmExpander_t *oneExpander; smpRespDiscover_t *pdmSMPDiscoverResp = agNULL; smpRespDiscover2_t *pdmSMPDiscover2Resp = agNULL; dmDiscovery_t *discovery; DM_DBG3(("dmConfigureRouteTimerCB: start\n")); onePortContext = (dmIntPortContext_t *)timerData1; oneExpander = (dmExpander_t *)timerData2; if (oneExpander->SAS2 == 0) { pdmSMPDiscoverResp = (smpRespDiscover_t *)timerData3; } else { pdmSMPDiscover2Resp = (smpRespDiscover2_t *)timerData3; } discovery = &(onePortContext->discovery); DM_DBG3(("dmConfigureRouteTimerCB: onePortContext %p oneExpander %p pdmSMPDiscoverResp %p\n", onePortContext, oneExpander, pdmSMPDiscoverResp)); DM_DBG3(("dmConfigureRouteTimerCB: discovery %p\n", discovery)); DM_DBG3(("dmConfigureRouteTimerCB: pid %d configureRouteRetries %d\n", onePortContext->id, discovery->configureRouteRetries)); DM_DBG3(("dmConfigureRouteTimerCB: 
discovery.status %d\n", discovery->status)); discovery->configureRouteRetries++; if (discovery->configureRouteRetries >= dmAllShared->MaxRetryDiscovery) { DM_DBG3(("dmConfigureRouteTimerCB: retries are over\n")); tddmSingleThreadedEnter(dmRoot, DM_TIMER_LOCK); if (discovery->configureRouteTimer.timerRunning == agTRUE) { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); dmKillTimer( dmRoot, &discovery->configureRouteTimer ); } else { tddmSingleThreadedLeave(dmRoot, DM_TIMER_LOCK); } discovery->configureRouteRetries = 0; /* failed the discovery */ dmDiscoverDone(dmRoot, onePortContext, DM_RC_FAILURE); return; } if (oneExpander->SAS2 == 0) { if (onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) { DM_DBG3(("dmConfigureRouteTimerCB: proceed by calling dmDownStreamDiscoverExpanderPhy\n")); dmhexdump("dmConfigureRouteTimerCB", (bit8*)pdmSMPDiscoverResp, sizeof(smpRespDiscover_t)); discovery->configureRouteRetries = 0; dmDownStreamDiscoverExpanderPhy(dmRoot, onePortContext, oneExpander, pdmSMPDiscoverResp); } else { DM_DBG3(("dmConfigureRouteTimerCB: setting timer again\n")); /* set the timer again */ dmSetTimerRequest( dmRoot, &discovery->configureRouteTimer, CONFIGURE_ROUTE_TIMER_VALUE/dmAllShared->usecsPerTick, dmConfigureRouteTimerCB, (void *)onePortContext, (void *)oneExpander, (void *)pdmSMPDiscoverResp ); dmAddTimer ( dmRoot, &dmAllShared->timerlist, &discovery->configureRouteTimer ); } } /* SAS 1.1 */ else { /* SAS 2 */ if (onePortContext->discovery.status == DISCOVERY_DOWN_STREAM) { DM_DBG2(("dmConfigureRouteTimerCB: proceed by calling dmDownStreamDiscover2ExpanderPhy\n")); dmhexdump("dmConfigureRouteTimerCB", (bit8*)pdmSMPDiscover2Resp, sizeof(smpRespDiscover2_t)); dmDownStreamDiscover2ExpanderPhy(dmRoot, onePortContext, oneExpander, pdmSMPDiscover2Resp); } else { DM_DBG2(("dmConfigureRouteTimerCB: setting timer again\n")); /* set the timer again */ dmSetTimerRequest( dmRoot, &discovery->configureRouteTimer, CONFIGURE_ROUTE_TIMER_VALUE/dmAllShared->usecsPerTick, dmConfigureRouteTimerCB, (void *)onePortContext, (void *)oneExpander, (void *)pdmSMPDiscover2Resp ); dmAddTimer ( dmRoot, &dmAllShared->timerlist, &discovery->configureRouteTimer ); } } return; } #endif /* FDS_ DM */ Index: head/sys/dev/ppbus/lpt.c =================================================================== --- head/sys/dev/ppbus/lpt.c (revision 357663) +++ head/sys/dev/ppbus/lpt.c (revision 357664) @@ -1,1004 +1,1004 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1990 William F. Jolitz, TeleMuse * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This software is a component of "386BSD" developed by * William F. Jolitz, TeleMuse. * 4. Neither the name of the developer nor the name "386BSD" * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. 
JOLITZ * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT. * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT * NOT MAKE USE OF THIS WORK. * * FOR USERS WHO WISH TO UNDERSTAND THE 386BSD SYSTEM DEVELOPED * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS * OF 386BSD ENTITLED "386BSD FROM THE INSIDE OUT" WILL BE AVAILABLE LATE 1992. * * THIS SOFTWARE IS PROVIDED BY THE DEVELOPER ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE DEVELOPER BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: unknown origin, 386BSD 0.1 * From Id: lpt.c,v 1.55.2.1 1996/11/12 09:08:38 phk Exp * From Id: nlpt.c,v 1.14 1999/02/08 13:55:43 des Exp */ #include __FBSDID("$FreeBSD$"); /* * Device Driver for AT parallel printer port * Written by William Jolitz 12/18/90 */ /* * Updated for ppbus by Nicolas Souchu * [Mon Jul 28 1997] */ #include "opt_lpt.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ppbus_if.h" #include #ifndef LPT_DEBUG #define lprintf(args) #else #define lprintf(args) \ do { \ if (lptflag) \ printf args; \ } while (0) static int volatile lptflag = 1; #endif #define LPINITRDY 4 /* wait up to 4 seconds for a ready */ #define LPTOUTINITIAL 10 /* initial timeout to wait for ready 1/10 s */ #define LPTOUTMAX 1 /* maximal timeout 1 s */ #define LPPRI (PZERO+8) #define BUFSIZE 1024 #define BUFSTATSIZE 32 struct lpt_data { device_t sc_dev; struct cdev *sc_cdev; struct cdev *sc_cdev_bypass; short sc_state; /* default case: negative prime, negative ack, handshake strobe, prime once */ u_char sc_control; char sc_flags; #define LP_POS_INIT 0x04 /* if we are a positive init signal */ #define LP_POS_ACK 0x08 /* if we are a positive going ack */ #define LP_NO_PRIME 0x10 /* don't prime the printer at all */ #define LP_PRIMEOPEN 0x20 /* prime on every open */ #define LP_AUTOLF 0x40 /* tell printer to do an automatic lf */ #define LP_BYPASS 0x80 /* bypass printer ready checks */ void *sc_inbuf; void *sc_statbuf; short sc_xfercnt ; char sc_primed; char *sc_cp ; u_short sc_irq ; /* IRQ status of port */ #define LP_HAS_IRQ 0x01 /* we have an irq available */ #define LP_USE_IRQ 0x02 /* we are using our irq */ #define LP_ENABLE_IRQ 0x04 /* enable IRQ on open */ #define LP_ENABLE_EXT 0x10 /* we shall use advanced mode when possible */ u_char sc_backoff ; /* time to call lptout() again */ struct callout sc_timer; struct resource *sc_intr_resource; /* interrupt resource */ void *sc_intr_cookie; /* interrupt cookie */ }; #define 
LPT_NAME "lpt" /* our official name */ static callout_func_t lptout; static int lpt_port_test(device_t dev, u_char data, u_char mask); static int lpt_detect(device_t dev); #define DEVTOSOFTC(dev) \ ((struct lpt_data *)device_get_softc(dev)) static void lptintr(void *arg); static devclass_t lpt_devclass; /* bits for state */ #define OPEN (1<<0) /* device is open */ #define ASLP (1<<1) /* awaiting draining of printer */ #define EERROR (1<<2) /* error was received from printer */ #define OBUSY (1<<3) /* printer is busy doing output */ #define LPTOUT (1<<4) /* timeout while not selected */ #define TOUT (1<<5) /* timeout while not selected */ #define LPTINIT (1<<6) /* waiting to initialize for open */ #define INTERRUPTED (1<<7) /* write call was interrupted */ #define HAVEBUS (1<<8) /* the driver owns the bus */ /* status masks to interrogate printer status */ #define RDY_MASK (LPS_SEL|LPS_OUT|LPS_NBSY|LPS_NERR) /* ready ? */ #define LP_READY (LPS_SEL|LPS_NBSY|LPS_NERR) /* Printer Ready condition - from lpa.c */ /* Only used in polling code */ #define LPS_INVERT (LPS_NBSY | LPS_NACK | LPS_SEL | LPS_NERR) #define LPS_MASK (LPS_NBSY | LPS_NACK | LPS_OUT | LPS_SEL | LPS_NERR) #define NOT_READY(ppbus) ((ppb_rstr(ppbus)^LPS_INVERT)&LPS_MASK) #define MAX_SLEEP (hz*5) /* Timeout while waiting for device ready */ #define MAX_SPIN 20 /* Max delay for device ready in usecs */ static d_open_t lptopen; static d_close_t lptclose; static d_write_t lptwrite; static d_read_t lptread; static d_ioctl_t lptioctl; static struct cdevsw lpt_cdevsw = { .d_version = D_VERSION, .d_open = lptopen, .d_close = lptclose, .d_read = lptread, .d_write = lptwrite, .d_ioctl = lptioctl, .d_name = LPT_NAME, }; static int lpt_request_ppbus(device_t dev, int how) { device_t ppbus = device_get_parent(dev); struct lpt_data *sc = DEVTOSOFTC(dev); int error; /* * We might already have the bus for a write(2) after an interrupted * write(2) call. */ ppb_assert_locked(ppbus); if (sc->sc_state & HAVEBUS) return (0); error = ppb_request_bus(ppbus, dev, how); if (error == 0) sc->sc_state |= HAVEBUS; return (error); } static int lpt_release_ppbus(device_t dev) { device_t ppbus = device_get_parent(dev); struct lpt_data *sc = DEVTOSOFTC(dev); int error = 0; ppb_assert_locked(ppbus); if (sc->sc_state & HAVEBUS) { error = ppb_release_bus(ppbus, dev); if (error == 0) sc->sc_state &= ~HAVEBUS; } return (error); } /* * Internal routine to lptprobe to do port tests of one byte value */ static int lpt_port_test(device_t ppbus, u_char data, u_char mask) { int temp, timeout; data = data & mask; ppb_wdtr(ppbus, data); timeout = 10000; do { DELAY(10); temp = ppb_rdtr(ppbus) & mask; } while (temp != data && --timeout); lprintf(("out=%x\tin=%x\ttout=%d\n", data, temp, timeout)); return (temp == data); } /* * Probe simplified by replacing multiple loops with a hardcoded * test pattern - 1999/02/08 des@freebsd.org * * New lpt port probe Geoff Rehmet - Rhodes University - 14/2/94 * Based partially on Rod Grimes' printer probe * * Logic: * 1) If no port address was given, use the bios detected ports * and autodetect what ports the printers are on. * 2) Otherwise, probe the data port at the address given, * using the method in Rod Grimes' port probe. * (Much code ripped off directly from Rod's probe.) * * Comments from Rod's probe: * Logic: * 1) You should be able to write to and read back the same value * to the data port. Do an alternating zeros, alternating ones, * walking zero, and walking one test to check for stuck bits. 
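 *
 * [Editor's note] Point 1) is exactly what lpt_port_test() and the
 * testbyte[] table in lpt_detect() below implement; in outline:
 *
 *	for each pattern in { 0x55, 0xaa, walking zeros, walking ones }
 *		write the pattern to the data port, let it settle,
 *		read it back through the mask; any mismatch means a
 *		stuck or shorted data line and the probe fails.
 *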
* * 2) You should be able to write to and read back the same value * to the control port lower 5 bits, the upper 3 bits are reserved - * per the IBM PC technical reference manauls and different boards + * per the IBM PC technical reference manuals and different boards * do different things with them. Do an alternating zeros, alternating * ones, walking zero, and walking one test to check for stuck bits. * * Some printers drag the strobe line down when they are powered off * so this bit has been masked out of the control port test. * * XXX Some printers may not like a fast pulse on init or strobe, I * don't know at this point, if that becomes a problem these bits * should be turned off in the mask byte for the control port test. * * We are finally left with a mask of 0x14, due to some printers * being adamant about holding other bits high ........ * * Before probing the control port, we write a 0 to the data port - * If not, some printers chuck out garbage when the strobe line * gets toggled. * * 3) Set the data and control ports to a value of 0 * * This probe routine has been tested on Epson Lx-800, HP LJ3P, * Epson FX-1170 and C.Itoh 8510RM * printers. * Quick exit on fail added. */ static int lpt_detect(device_t dev) { device_t ppbus = device_get_parent(dev); static u_char testbyte[18] = { 0x55, /* alternating zeros */ 0xaa, /* alternating ones */ 0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f, /* walking zero */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 /* walking one */ }; int i, error, status; status = 1; /* assume success */ ppb_lock(ppbus); if ((error = lpt_request_ppbus(dev, PPB_DONTWAIT))) { ppb_unlock(ppbus); device_printf(dev, "cannot alloc ppbus (%d)!\n", error); return (0); } for (i = 0; i < 18 && status; i++) if (!lpt_port_test(ppbus, testbyte[i], 0xff)) { status = 0; break; } /* write 0's to control and data ports */ ppb_wdtr(ppbus, 0); ppb_wctr(ppbus, 0); lpt_release_ppbus(dev); ppb_unlock(ppbus); return (status); } static void lpt_identify(driver_t *driver, device_t parent) { device_t dev; dev = device_find_child(parent, LPT_NAME, -1); if (!dev) BUS_ADD_CHILD(parent, 0, LPT_NAME, -1); } /* * lpt_probe() */ static int lpt_probe(device_t dev) { if (!lpt_detect(dev)) return (ENXIO); device_set_desc(dev, "Printer"); return (0); } static int lpt_attach(device_t dev) { device_t ppbus = device_get_parent(dev); struct lpt_data *sc = DEVTOSOFTC(dev); int rid = 0, unit = device_get_unit(dev); int error; sc->sc_primed = 0; /* not primed yet */ ppb_init_callout(ppbus, &sc->sc_timer, 0); ppb_lock(ppbus); if ((error = lpt_request_ppbus(dev, PPB_DONTWAIT))) { ppb_unlock(ppbus); device_printf(dev, "cannot alloc ppbus (%d)!\n", error); return (0); } ppb_wctr(ppbus, LPC_NINIT); lpt_release_ppbus(dev); ppb_unlock(ppbus); /* declare our interrupt handler */ sc->sc_intr_resource = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE); if (sc->sc_intr_resource) { error = bus_setup_intr(dev, sc->sc_intr_resource, INTR_TYPE_TTY | INTR_MPSAFE, NULL, lptintr, sc, &sc->sc_intr_cookie); if (error) { bus_release_resource(dev, SYS_RES_IRQ, rid, sc->sc_intr_resource); device_printf(dev, "Unable to register interrupt handler\n"); return (error); } sc->sc_irq = LP_HAS_IRQ | LP_USE_IRQ | LP_ENABLE_IRQ; device_printf(dev, "Interrupt-driven port\n"); } else { sc->sc_irq = 0; device_printf(dev, "Polled port\n"); } lprintf(("irq %x\n", sc->sc_irq)); sc->sc_inbuf = malloc(BUFSIZE, M_DEVBUF, M_WAITOK); sc->sc_statbuf = malloc(BUFSTATSIZE, M_DEVBUF, M_WAITOK); sc->sc_dev = dev; sc->sc_cdev =
make_dev(&lpt_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600, LPT_NAME "%d", unit); sc->sc_cdev->si_drv1 = sc; sc->sc_cdev->si_drv2 = 0; sc->sc_cdev_bypass = make_dev(&lpt_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600, LPT_NAME "%d.ctl", unit); sc->sc_cdev_bypass->si_drv1 = sc; sc->sc_cdev_bypass->si_drv2 = (void *)LP_BYPASS; return (0); } static int lpt_detach(device_t dev) { struct lpt_data *sc = DEVTOSOFTC(dev); device_t ppbus = device_get_parent(dev); destroy_dev(sc->sc_cdev); destroy_dev(sc->sc_cdev_bypass); ppb_lock(ppbus); lpt_release_ppbus(dev); ppb_unlock(ppbus); callout_drain(&sc->sc_timer); if (sc->sc_intr_resource != NULL) { bus_teardown_intr(dev, sc->sc_intr_resource, sc->sc_intr_cookie); bus_release_resource(dev, SYS_RES_IRQ, 0, sc->sc_intr_resource); } free(sc->sc_inbuf, M_DEVBUF); free(sc->sc_statbuf, M_DEVBUF); return (0); } static void lptout(void *arg) { struct lpt_data *sc = arg; device_t dev = sc->sc_dev; device_t ppbus; ppbus = device_get_parent(dev); ppb_assert_locked(ppbus); lprintf(("T %x ", ppb_rstr(ppbus))); if (sc->sc_state & OPEN) { sc->sc_backoff++; if (sc->sc_backoff > hz/LPTOUTMAX) sc->sc_backoff = hz/LPTOUTMAX; callout_reset(&sc->sc_timer, sc->sc_backoff, lptout, sc); } else sc->sc_state &= ~TOUT; if (sc->sc_state & EERROR) sc->sc_state &= ~EERROR; /* * Avoid possible hangs due to missed interrupts */ if (sc->sc_xfercnt) { lptintr(sc); } else { sc->sc_state &= ~OBUSY; wakeup(dev); } } /* * lptopen -- reset the printer, then wait until it's selected and not busy. * If LP_BYPASS flag is selected, then we do not try to select the * printer -- this is just used for passing ioctls. */ static int lptopen(struct cdev *dev, int flags, int fmt, struct thread *td) { int trys, err; struct lpt_data *sc = dev->si_drv1; device_t lptdev; device_t ppbus; if (!sc) return (ENXIO); lptdev = sc->sc_dev; ppbus = device_get_parent(lptdev); ppb_lock(ppbus); if (sc->sc_state) { lprintf(("%s: still open %x\n", device_get_nameunit(lptdev), sc->sc_state)); ppb_unlock(ppbus); return(EBUSY); } else sc->sc_state |= LPTINIT; sc->sc_flags = (uintptr_t)dev->si_drv2; /* Check for open with BYPASS flag set. 
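 * [Editor's note] The .ctl bypass node created in lpt_attach() only marks
 * the unit OPEN and returns without claiming the ppbus or touching the
 * printer; lptread() and lptwrite() below reject it with EPERM, so it is
 * usable purely for ioctls such as LPT_IRQ.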
*/ if (sc->sc_flags & LP_BYPASS) { sc->sc_state = OPEN; ppb_unlock(ppbus); return(0); } /* request the ppbus only if we don't have it already */ if ((err = lpt_request_ppbus(lptdev, PPB_WAIT|PPB_INTR)) != 0) { /* give it a chance to try later */ sc->sc_state = 0; ppb_unlock(ppbus); return (err); } lprintf(("%s flags 0x%x\n", device_get_nameunit(lptdev), sc->sc_flags)); /* set IRQ status according to ENABLE_IRQ flag */ if (sc->sc_irq & LP_ENABLE_IRQ) sc->sc_irq |= LP_USE_IRQ; else sc->sc_irq &= ~LP_USE_IRQ; /* init printer */ if ((sc->sc_flags & LP_NO_PRIME) == 0) { if ((sc->sc_flags & LP_PRIMEOPEN) || sc->sc_primed == 0) { ppb_wctr(ppbus, 0); sc->sc_primed++; DELAY(500); } } ppb_wctr(ppbus, LPC_SEL|LPC_NINIT); /* wait till ready (printer running diagnostics) */ trys = 0; do { /* ran out of waiting for the printer */ if (trys++ >= LPINITRDY*4) { lprintf(("status %x\n", ppb_rstr(ppbus))); lpt_release_ppbus(lptdev); sc->sc_state = 0; ppb_unlock(ppbus); return (EBUSY); } /* wait 1/4 second, give up if we get a signal */ if (ppb_sleep(ppbus, lptdev, LPPRI | PCATCH, "lptinit", hz / 4) != EWOULDBLOCK) { lpt_release_ppbus(lptdev); sc->sc_state = 0; ppb_unlock(ppbus); return (EBUSY); } /* is printer online and ready for output */ } while ((ppb_rstr(ppbus) & (LPS_SEL|LPS_OUT|LPS_NBSY|LPS_NERR)) != (LPS_SEL|LPS_NBSY|LPS_NERR)); sc->sc_control = LPC_SEL|LPC_NINIT; if (sc->sc_flags & LP_AUTOLF) sc->sc_control |= LPC_AUTOL; /* enable interrupt if interrupt-driven */ if (sc->sc_irq & LP_USE_IRQ) sc->sc_control |= LPC_ENA; ppb_wctr(ppbus, sc->sc_control); sc->sc_state &= ~LPTINIT; sc->sc_state |= OPEN; sc->sc_xfercnt = 0; /* only use timeout if using interrupt */ lprintf(("irq %x\n", sc->sc_irq)); if (sc->sc_irq & LP_USE_IRQ) { sc->sc_state |= TOUT; sc->sc_backoff = hz / LPTOUTINITIAL; callout_reset(&sc->sc_timer, sc->sc_backoff, lptout, sc); } /* release the ppbus */ lpt_release_ppbus(lptdev); ppb_unlock(ppbus); lprintf(("opened.\n")); return(0); } /* * lptclose -- close the device, free the local line buffer. * * Check for interrupted write call added. */ static int lptclose(struct cdev *dev, int flags, int fmt, struct thread *td) { struct lpt_data *sc = dev->si_drv1; device_t lptdev = sc->sc_dev; device_t ppbus = device_get_parent(lptdev); int err; ppb_lock(ppbus); if (sc->sc_flags & LP_BYPASS) goto end_close; if ((err = lpt_request_ppbus(lptdev, PPB_WAIT|PPB_INTR)) != 0) { ppb_unlock(ppbus); return (err); } /* if the last write was interrupted, don't complete it */ if ((!(sc->sc_state & INTERRUPTED)) && (sc->sc_irq & LP_USE_IRQ)) while ((ppb_rstr(ppbus) & (LPS_SEL|LPS_OUT|LPS_NBSY|LPS_NERR)) != (LPS_SEL|LPS_NBSY|LPS_NERR) || sc->sc_xfercnt) /* wait 1 second, give up if we get a signal */ if (ppb_sleep(ppbus, lptdev, LPPRI | PCATCH, "lpclose", hz) != EWOULDBLOCK) break; sc->sc_state &= ~OPEN; callout_stop(&sc->sc_timer); ppb_wctr(ppbus, LPC_NINIT); /* * unregistration of interrupt forced by release */ lpt_release_ppbus(lptdev); end_close: sc->sc_state = 0; sc->sc_xfercnt = 0; ppb_unlock(ppbus); lprintf(("closed.\n")); return(0); } /* * lpt_pushbytes() * Workhorse for actually spinning and writing bytes to printer * Derived from lpa.c * Originally by ? * * This code is only used when we are polling the port */ static int lpt_pushbytes(struct lpt_data *sc) { device_t dev = sc->sc_dev; device_t ppbus = device_get_parent(dev); int spin, err, tic; char ch; ppb_assert_locked(ppbus); lprintf(("p")); /* loop for every character .. 
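 * [Editor's note] The polled loop below first spins up to MAX_SPIN (20)
 * one-microsecond probes of the BUSY bit, then falls back to sleeps whose
 * length grows as tic = tic + tic + 1, i.e. 1, 3, 7, 15, ... ticks,
 * clamped to MAX_SLEEP (hz * 5) per wait.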
*/ while (sc->sc_xfercnt > 0) { /* printer data */ ch = *(sc->sc_cp); sc->sc_cp++; sc->sc_xfercnt--; /* * Wait for printer ready. * Loop 20 usecs testing BUSY bit, then sleep * for exponentially increasing timeout. (vak) */ for (spin = 0; NOT_READY(ppbus) && spin < MAX_SPIN; ++spin) DELAY(1); /* XXX delay is NOT this accurate! */ if (spin >= MAX_SPIN) { tic = 0; while (NOT_READY(ppbus)) { /* * Now sleep, every cycle a * little longer .. */ tic = tic + tic + 1; /* * But no more than 10 seconds. (vak) */ if (tic > MAX_SLEEP) tic = MAX_SLEEP; err = ppb_sleep(ppbus, dev, LPPRI, LPT_NAME "poll", tic); if (err != EWOULDBLOCK) { return (err); } } } /* output data */ ppb_wdtr(ppbus, ch); /* strobe */ ppb_wctr(ppbus, sc->sc_control|LPC_STB); ppb_wctr(ppbus, sc->sc_control); } return(0); } /* * lptread --retrieve printer status in IEEE1284 NIBBLE mode */ static int lptread(struct cdev *dev, struct uio *uio, int ioflag) { struct lpt_data *sc = dev->si_drv1; device_t lptdev = sc->sc_dev; device_t ppbus = device_get_parent(lptdev); int error = 0, len; if (sc->sc_flags & LP_BYPASS) { /* we can't do reads in bypass mode */ return (EPERM); } ppb_lock(ppbus); if ((error = ppb_1284_negociate(ppbus, PPB_NIBBLE, 0))) { ppb_unlock(ppbus); return (error); } /* read data in another buffer, read/write may be simultaneous */ len = 0; while (uio->uio_resid) { if ((error = ppb_1284_read(ppbus, PPB_NIBBLE, sc->sc_statbuf, min(BUFSTATSIZE, uio->uio_resid), &len))) { goto error; } if (!len) goto error; /* no more data */ ppb_unlock(ppbus); error = uiomove(sc->sc_statbuf, len, uio); ppb_lock(ppbus); if (error) goto error; } error: ppb_1284_terminate(ppbus); ppb_unlock(ppbus); return (error); } /* * lptwrite --copy a line from user space to a local buffer, then call * putc to get the chars moved to the output queue. * * Flagging of interrupted write added. */ static int lptwrite(struct cdev *dev, struct uio *uio, int ioflag) { register unsigned n; int err; struct lpt_data *sc = dev->si_drv1; device_t lptdev = sc->sc_dev; device_t ppbus = device_get_parent(lptdev); if (sc->sc_flags & LP_BYPASS) { /* we can't do writes in bypass mode */ return (EPERM); } /* request the ppbus only if we don't have it already */ ppb_lock(ppbus); if ((err = lpt_request_ppbus(lptdev, PPB_WAIT|PPB_INTR)) != 0) { ppb_unlock(ppbus); return (err); } sc->sc_state &= ~INTERRUPTED; while ((n = min(BUFSIZE, uio->uio_resid)) != 0) { sc->sc_cp = sc->sc_inbuf; ppb_unlock(ppbus); err = uiomove(sc->sc_cp, n, uio); ppb_lock(ppbus); if (err) break; sc->sc_xfercnt = n; if (sc->sc_irq & LP_ENABLE_EXT) { /* try any extended mode */ err = ppb_write(ppbus, sc->sc_cp, sc->sc_xfercnt, 0); switch (err) { case 0: /* if not all data was sent, we could rely * on polling for the last bytes */ sc->sc_xfercnt = 0; break; case EINTR: sc->sc_state |= INTERRUPTED; ppb_unlock(ppbus); return (err); case EINVAL: /* advanced mode not avail */ log(LOG_NOTICE, "%s: advanced mode not avail, polling\n", device_get_nameunit(sc->sc_dev)); break; default: ppb_unlock(ppbus); return (err); } } else while ((sc->sc_xfercnt > 0)&&(sc->sc_irq & LP_USE_IRQ)) { lprintf(("i")); /* if the printer is ready for a char, */ /* give it one */ if ((sc->sc_state & OBUSY) == 0){ lprintf(("\nC %d.
", sc->sc_xfercnt)); lptintr(sc); } lprintf(("W ")); if (sc->sc_state & OBUSY) if ((err = ppb_sleep(ppbus, lptdev, LPPRI|PCATCH, LPT_NAME "write", 0))) { sc->sc_state |= INTERRUPTED; ppb_unlock(ppbus); return(err); } } /* check to see if we must do a polled write */ if (!(sc->sc_irq & LP_USE_IRQ) && (sc->sc_xfercnt)) { lprintf(("p")); err = lpt_pushbytes(sc); if (err) { ppb_unlock(ppbus); return (err); } } } /* we have not been interrupted, release the ppbus */ lpt_release_ppbus(lptdev); ppb_unlock(ppbus); return (err); } /* * lptintr -- handle printer interrupts which occur when the printer is * ready to accept another char. * * do checking for interrupted write call. */ static void lptintr(void *arg) { struct lpt_data *sc = arg; device_t lptdev = sc->sc_dev; device_t ppbus = device_get_parent(lptdev); int sts = 0; int i; /* * Is printer online and ready for output? * * Avoid falling back to lptout() too quickly. First spin-loop * to see if the printer will become ready ``really soon now''. */ for (i = 0; i < 100 && ((sts=ppb_rstr(ppbus)) & RDY_MASK) != LP_READY; i++) ; if ((sts & RDY_MASK) == LP_READY) { sc->sc_state = (sc->sc_state | OBUSY) & ~EERROR; sc->sc_backoff = hz / LPTOUTINITIAL; if (sc->sc_xfercnt) { /* send char */ /*lprintf(("%x ", *sc->sc_cp)); */ ppb_wdtr(ppbus, *sc->sc_cp++) ; ppb_wctr(ppbus, sc->sc_control|LPC_STB); /* DELAY(X) */ ppb_wctr(ppbus, sc->sc_control); /* any more data for printer */ if (--(sc->sc_xfercnt) > 0) return; } /* * No more data waiting for printer. * Wakeup is not done if write call was not interrupted. */ sc->sc_state &= ~OBUSY; if (!(sc->sc_state & INTERRUPTED)) wakeup(lptdev); lprintf(("w ")); return; } else { /* check for error */ if (((sts & (LPS_NERR | LPS_OUT) ) != LPS_NERR) && (sc->sc_state & OPEN)) sc->sc_state |= EERROR; /* lptout() will jump in and try to restart. */ } lprintf(("sts %x ", sts)); } static int lptioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags, struct thread *td) { int error = 0; struct lpt_data *sc = dev->si_drv1; device_t ppbus; u_char old_sc_irq; /* old printer IRQ status */ switch (cmd) { case LPT_IRQ : ppbus = device_get_parent(sc->sc_dev); ppb_lock(ppbus); if (sc->sc_irq & LP_HAS_IRQ) { /* * NOTE: * If the IRQ status is changed, * this will only be visible on the * next open. * * If interrupt status changes, * this gets syslog'd. */ old_sc_irq = sc->sc_irq; switch (*(int*)data) { case 0: sc->sc_irq &= (~LP_ENABLE_IRQ); break; case 1: sc->sc_irq &= (~LP_ENABLE_EXT); sc->sc_irq |= LP_ENABLE_IRQ; break; case 2: /* classic irq based transfer and advanced * modes are in conflict */ sc->sc_irq &= (~LP_ENABLE_IRQ); sc->sc_irq |= LP_ENABLE_EXT; break; case 3: sc->sc_irq &= (~LP_ENABLE_EXT); break; default: break; } if (old_sc_irq != sc->sc_irq ) log(LOG_NOTICE, "%s: switched to %s %s mode\n", device_get_nameunit(sc->sc_dev), (sc->sc_irq & LP_ENABLE_IRQ)? "interrupt-driven":"polled", (sc->sc_irq & LP_ENABLE_EXT)? 
"extended":"standard"); } else /* polled port */ error = EOPNOTSUPP; ppb_unlock(ppbus); break; default: error = ENODEV; } return(error); } static device_method_t lpt_methods[] = { /* device interface */ DEVMETHOD(device_identify, lpt_identify), DEVMETHOD(device_probe, lpt_probe), DEVMETHOD(device_attach, lpt_attach), DEVMETHOD(device_detach, lpt_detach), { 0, 0 } }; static driver_t lpt_driver = { LPT_NAME, lpt_methods, sizeof(struct lpt_data), }; DRIVER_MODULE(lpt, ppbus, lpt_driver, lpt_devclass, 0, 0); MODULE_DEPEND(lpt, ppbus, 1, 1, 1); Index: head/sys/dev/puc/pucdata.c =================================================================== --- head/sys/dev/puc/pucdata.c (revision 357663) +++ head/sys/dev/puc/pucdata.c (revision 357664) @@ -1,1871 +1,1871 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * PCI "universal" communications card driver configuration data (used to * match/attach the cards). 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include static puc_config_f puc_config_advantech; static puc_config_f puc_config_amc; static puc_config_f puc_config_diva; static puc_config_f puc_config_exar; static puc_config_f puc_config_exar_pcie; static puc_config_f puc_config_icbook; static puc_config_f puc_config_moxa; static puc_config_f puc_config_oxford_pci954; static puc_config_f puc_config_oxford_pcie; static puc_config_f puc_config_quatech; static puc_config_f puc_config_syba; static puc_config_f puc_config_siig; static puc_config_f puc_config_sunix; static puc_config_f puc_config_timedia; static puc_config_f puc_config_titan; const struct puc_cfg puc_pci_devices[] = { { 0x0009, 0x7168, 0xffff, 0, "Sunix SUN1889", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 0, 8, }, { 0x103c, 0x1048, 0x103c, 0x1049, "HP Diva Serial [GSP] Multiport UART - Tosca Console", DEFAULT_RCLK, PUC_PORT_3S, 0x10, 0, -1, .config_function = puc_config_diva }, { 0x103c, 0x1048, 0x103c, 0x104a, "HP Diva Serial [GSP] Multiport UART - Tosca Secondary", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 0, -1, .config_function = puc_config_diva }, { 0x103c, 0x1048, 0x103c, 0x104b, "HP Diva Serial [GSP] Multiport UART - Maestro SP2", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 0, -1, .config_function = puc_config_diva }, { 0x103c, 0x1048, 0x103c, 0x1223, "HP Diva Serial [GSP] Multiport UART - Superdome Console", DEFAULT_RCLK, PUC_PORT_3S, 0x10, 0, -1, .config_function = puc_config_diva }, { 0x103c, 0x1048, 0x103c, 0x1226, "HP Diva Serial [GSP] Multiport UART - Keystone SP2", DEFAULT_RCLK, PUC_PORT_3S, 0x10, 0, -1, .config_function = puc_config_diva }, { 0x103c, 0x1048, 0x103c, 0x1282, "HP Diva Serial [GSP] Multiport UART - Everest SP2", DEFAULT_RCLK, PUC_PORT_3S, 0x10, 0, -1, .config_function = puc_config_diva }, { 0x10b5, 0x1076, 0x10b5, 0x1076, "VScom PCI-800", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x18, 0, 8, }, { 0x10b5, 0x1077, 0x10b5, 0x1077, "VScom PCI-400", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x18, 0, 8, }, { 0x10b5, 0x1103, 0x10b5, 0x1103, "VScom PCI-200", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x18, 4, 0, }, /* * Boca Research Turbo Serial 658 (8 serial port) card. * Appears to be the same as Chase Research PLC PCI-FAST8 * and Perle PCI-FAST8 Multi-Port serial cards. */ { 0x10b5, 0x9050, 0x12e0, 0x0021, "Boca Research Turbo Serial 658", DEFAULT_RCLK * 4, PUC_PORT_8S, 0x18, 0, 8, }, { 0x10b5, 0x9050, 0x12e0, 0x0031, "Boca Research Turbo Serial 654", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x18, 0, 8, }, /* * Dolphin Peripherals 4035 (dual serial port) card. PLX 9050, with * a seemingly-lame EEPROM setup that puts the Dolphin IDs * into the subsystem fields, and claims that it's a * network/misc (0x02/0x80) device. */ { 0x10b5, 0x9050, 0xd84d, 0x6808, "Dolphin Peripherals 4035", DEFAULT_RCLK, PUC_PORT_2S, 0x18, 4, 0, }, /* * Dolphin Peripherals 4014 (dual parallel port) card. PLX 9050, with * a seemingly-lame EEPROM setup that puts the Dolphin IDs * into the subsystem fields, and claims that it's a * network/misc (0x02/0x80) device. */ { 0x10b5, 0x9050, 0xd84d, 0x6810, "Dolphin Peripherals 4014", 0, PUC_PORT_2P, 0x20, 4, 0, }, { 0x10e8, 0x818e, 0xffff, 0, "Applied Micro Circuits 8 Port UART", DEFAULT_RCLK, PUC_PORT_8S, 0x14, -1, -1, .config_function = puc_config_amc }, /* * The following members of the Digi International Neo series are * based on Exar PCI chips, f. e. the 8 port variants on XR17V258IV. * Accordingly, the PCIe versions of these cards incorporate a PLX * PCIe-PCI-bridge. 
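 *
 * [Editor's note] How to read the entries in this table, as inferred by the
 * editor from the data itself (hedged, not the original author's wording):
 * each row matches on (vendor, device, subvendor, subdevice), with 0xffff/0
 * acting as a don't-care, then gives a description, a reference clock
 * (DEFAULT_RCLK is the classic 1.8432 MHz 16x50 clock, often multiplied),
 * a port layout such as PUC_PORT_4S, and BAR/offset hints.  A tail of
 * "0x10, 0, 8" appears to mean all ports live in the BAR at config offset
 * 0x10 spaced 8 bytes apart, "0x10, 4, 0" one port per BAR stepping the
 * BAR offset by 4, and -1 values defer to the entry's config_function.
 *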
*/ { 0x114f, 0x00b0, 0xffff, 0, "Digi Neo PCI 4 Port", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x114f, 0x00b1, 0xffff, 0, "Digi Neo PCI 8 Port", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x114f, 0x00f0, 0xffff, 0, "Digi Neo PCIe 8 Port", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x114f, 0x00f1, 0xffff, 0, "Digi Neo PCIe 4 Port", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x114f, 0x00f2, 0xffff, 0, "Digi Neo PCIe 4 Port RJ45", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x114f, 0x00f3, 0xffff, 0, "Digi Neo PCIe 8 Port RJ45", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x11fe, 0x8010, 0xffff, 0, "Comtrol RocketPort 550/8 RJ11 part A", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, { 0x11fe, 0x8011, 0xffff, 0, "Comtrol RocketPort 550/8 RJ11 part B", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, { 0x11fe, 0x8012, 0xffff, 0, "Comtrol RocketPort 550/8 Octa part A", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, { 0x11fe, 0x8013, 0xffff, 0, "Comtrol RocketPort 550/8 Octa part B", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, { 0x11fe, 0x8014, 0xffff, 0, "Comtrol RocketPort 550/4 RJ45", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, { 0x11fe, 0x8015, 0xffff, 0, "Comtrol RocketPort 550/Quad", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, { 0x11fe, 0x8016, 0xffff, 0, "Comtrol RocketPort 550/16 part A", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, { 0x11fe, 0x8017, 0xffff, 0, "Comtrol RocketPort 550/16 part B", DEFAULT_RCLK * 4, PUC_PORT_12S, 0x10, 0, 8, }, { 0x11fe, 0x8018, 0xffff, 0, "Comtrol RocketPort 550/8 part A", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, { 0x11fe, 0x8019, 0xffff, 0, "Comtrol RocketPort 550/8 part B", DEFAULT_RCLK * 4, PUC_PORT_4S, 0x10, 0, 8, }, /* * IBM SurePOS 300 Series (481033H) serial ports * Details can be found on the IBM RSS websites */ { 0x1014, 0x0297, 0xffff, 0, "IBM SurePOS 300 Series (481033H) serial ports", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0 }, /* * SIIG Boards. * * SIIG provides documentation for their boards at: * */ { 0x131f, 0x1010, 0xffff, 0, "SIIG Cyber I/O PCI 16C550 (10x family)", DEFAULT_RCLK, PUC_PORT_1S1P, 0x18, 4, 0, }, { 0x131f, 0x1011, 0xffff, 0, "SIIG Cyber I/O PCI 16C650 (10x family)", DEFAULT_RCLK, PUC_PORT_1S1P, 0x18, 4, 0, }, { 0x131f, 0x1012, 0xffff, 0, "SIIG Cyber I/O PCI 16C850 (10x family)", DEFAULT_RCLK, PUC_PORT_1S1P, 0x18, 4, 0, }, { 0x131f, 0x1021, 0xffff, 0, "SIIG Cyber Parallel Dual PCI (10x family)", 0, PUC_PORT_2P, 0x18, 8, 0, }, { 0x131f, 0x1030, 0xffff, 0, "SIIG Cyber Serial Dual PCI 16C550 (10x family)", DEFAULT_RCLK, PUC_PORT_2S, 0x18, 4, 0, }, { 0x131f, 0x1031, 0xffff, 0, "SIIG Cyber Serial Dual PCI 16C650 (10x family)", DEFAULT_RCLK, PUC_PORT_2S, 0x18, 4, 0, }, { 0x131f, 0x1032, 0xffff, 0, "SIIG Cyber Serial Dual PCI 16C850 (10x family)", DEFAULT_RCLK, PUC_PORT_2S, 0x18, 4, 0, }, { 0x131f, 0x1034, 0xffff, 0, /* XXX really? */ "SIIG Cyber 2S1P PCI 16C550 (10x family)", DEFAULT_RCLK, PUC_PORT_2S1P, 0x18, 4, 0, }, { 0x131f, 0x1035, 0xffff, 0, /* XXX really? */ "SIIG Cyber 2S1P PCI 16C650 (10x family)", DEFAULT_RCLK, PUC_PORT_2S1P, 0x18, 4, 0, }, { 0x131f, 0x1036, 0xffff, 0, /* XXX really? 
*/ "SIIG Cyber 2S1P PCI 16C850 (10x family)", DEFAULT_RCLK, PUC_PORT_2S1P, 0x18, 4, 0, }, { 0x131f, 0x1050, 0xffff, 0, "SIIG Cyber 4S PCI 16C550 (10x family)", DEFAULT_RCLK, PUC_PORT_4S, 0x18, 4, 0, }, { 0x131f, 0x1051, 0xffff, 0, "SIIG Cyber 4S PCI 16C650 (10x family)", DEFAULT_RCLK, PUC_PORT_4S, 0x18, 4, 0, }, { 0x131f, 0x1052, 0xffff, 0, "SIIG Cyber 4S PCI 16C850 (10x family)", DEFAULT_RCLK, PUC_PORT_4S, 0x18, 4, 0, }, { 0x131f, 0x2010, 0xffff, 0, "SIIG Cyber I/O PCI 16C550 (20x family)", DEFAULT_RCLK, PUC_PORT_1S1P, 0x10, 4, 0, }, { 0x131f, 0x2011, 0xffff, 0, "SIIG Cyber I/O PCI 16C650 (20x family)", DEFAULT_RCLK, PUC_PORT_1S1P, 0x10, 4, 0, }, { 0x131f, 0x2012, 0xffff, 0, "SIIG Cyber I/O PCI 16C850 (20x family)", DEFAULT_RCLK, PUC_PORT_1S1P, 0x10, 4, 0, }, { 0x131f, 0x2021, 0xffff, 0, "SIIG Cyber Parallel Dual PCI (20x family)", 0, PUC_PORT_2P, 0x10, 8, 0, }, { 0x131f, 0x2030, 0xffff, 0, "SIIG Cyber Serial Dual PCI 16C550 (20x family)", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x131f, 0x2031, 0xffff, 0, "SIIG Cyber Serial Dual PCI 16C650 (20x family)", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x131f, 0x2032, 0xffff, 0, "SIIG Cyber Serial Dual PCI 16C850 (20x family)", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x131f, 0x2040, 0xffff, 0, "SIIG Cyber 2P1S PCI 16C550 (20x family)", DEFAULT_RCLK, PUC_PORT_1S2P, 0x10, -1, 0, .config_function = puc_config_siig }, { 0x131f, 0x2041, 0xffff, 0, "SIIG Cyber 2P1S PCI 16C650 (20x family)", DEFAULT_RCLK, PUC_PORT_1S2P, 0x10, -1, 0, .config_function = puc_config_siig }, { 0x131f, 0x2042, 0xffff, 0, "SIIG Cyber 2P1S PCI 16C850 (20x family)", DEFAULT_RCLK, PUC_PORT_1S2P, 0x10, -1, 0, .config_function = puc_config_siig }, { 0x131f, 0x2050, 0xffff, 0, "SIIG Cyber 4S PCI 16C550 (20x family)", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x131f, 0x2051, 0xffff, 0, "SIIG Cyber 4S PCI 16C650 (20x family)", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x131f, 0x2052, 0xffff, 0, "SIIG Cyber 4S PCI 16C850 (20x family)", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x131f, 0x2060, 0xffff, 0, "SIIG Cyber 2S1P PCI 16C550 (20x family)", DEFAULT_RCLK, PUC_PORT_2S1P, 0x10, 4, 0, }, { 0x131f, 0x2061, 0xffff, 0, "SIIG Cyber 2S1P PCI 16C650 (20x family)", DEFAULT_RCLK, PUC_PORT_2S1P, 0x10, 4, 0, }, { 0x131f, 0x2062, 0xffff, 0, "SIIG Cyber 2S1P PCI 16C850 (20x family)", DEFAULT_RCLK, PUC_PORT_2S1P, 0x10, 4, 0, }, { 0x131f, 0x2081, 0xffff, 0, "SIIG PS8000 8S PCI 16C650 (20x family)", DEFAULT_RCLK, PUC_PORT_8S, 0x10, -1, -1, .config_function = puc_config_siig }, { 0x135c, 0x0010, 0xffff, 0, "Quatech QSC-100", -3, /* max 8x clock rate */ PUC_PORT_4S, 0x14, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x0020, 0xffff, 0, "Quatech DSC-100", -1, /* max 2x clock rate */ PUC_PORT_2S, 0x14, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x0030, 0xffff, 0, "Quatech DSC-200/300", -1, /* max 2x clock rate */ PUC_PORT_2S, 0x14, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x0040, 0xffff, 0, "Quatech QSC-200/300", -3, /* max 8x clock rate */ PUC_PORT_4S, 0x14, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x0050, 0xffff, 0, "Quatech ESC-100D", -3, /* max 8x clock rate */ PUC_PORT_8S, 0x14, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x0060, 0xffff, 0, "Quatech ESC-100M", -3, /* max 8x clock rate */ PUC_PORT_8S, 0x14, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x0170, 0xffff, 0, "Quatech QSCLP-100", -1, /* max 2x clock rate */ PUC_PORT_4S, 0x18, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x0180, 
0xffff, 0, "Quatech DSCLP-100", -1, /* max 3x clock rate */ PUC_PORT_2S, 0x18, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x01b0, 0xffff, 0, "Quatech DSCLP-200/300", -1, /* max 2x clock rate */ PUC_PORT_2S, 0x18, 0, 8, .config_function = puc_config_quatech }, { 0x135c, 0x01e0, 0xffff, 0, "Quatech ESCLP-100", -3, /* max 8x clock rate */ PUC_PORT_8S, 0x10, 0, 8, .config_function = puc_config_quatech }, { 0x1393, 0x1024, 0xffff, 0, "Moxa Technologies, Smartio CP-102E/PCIe", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x14, 0, -1, .config_function = puc_config_moxa }, { 0x1393, 0x1025, 0xffff, 0, "Moxa Technologies, Smartio CP-102EL/PCIe", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x14, 0, -1, .config_function = puc_config_moxa }, { 0x1393, 0x1040, 0xffff, 0, "Moxa Technologies, Smartio C104H/PCI", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x18, 0, 8, }, { 0x1393, 0x1041, 0xffff, 0, "Moxa Technologies, Smartio CP-104UL/PCI", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x18, 0, 8, }, { 0x1393, 0x1042, 0xffff, 0, "Moxa Technologies, Smartio CP-104JU/PCI", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x18, 0, 8, }, { 0x1393, 0x1043, 0xffff, 0, "Moxa Technologies, Smartio CP-104EL/PCIe", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x18, 0, 8, }, { 0x1393, 0x1045, 0xffff, 0, "Moxa Technologies, Smartio CP-104EL-A/PCIe", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x14, 0, -1, .config_function = puc_config_moxa }, { 0x1393, 0x1120, 0xffff, 0, "Moxa Technologies, CP-112UL", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x18, 0, 8, }, { 0x1393, 0x1141, 0xffff, 0, "Moxa Technologies, Industio CP-114", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x18, 0, 8, }, { 0x1393, 0x1144, 0xffff, 0, "Moxa Technologies, Smartio CP-114EL/PCIe", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x14, 0, -1, .config_function = puc_config_moxa }, { 0x1393, 0x1182, 0xffff, 0, "Moxa Technologies, Smartio CP-118EL-A/PCIe", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x14, 0, -1, .config_function = puc_config_moxa }, { 0x1393, 0x1680, 0xffff, 0, "Moxa Technologies, C168H/PCI", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x18, 0, 8, }, { 0x1393, 0x1681, 0xffff, 0, "Moxa Technologies, C168U/PCI", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x18, 0, 8, }, { 0x1393, 0x1682, 0xffff, 0, "Moxa Technologies, CP-168EL/PCIe", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x18, 0, 8, }, { 0x1393, 0x1683, 0xffff, 0, "Moxa Technologies, Smartio CP-168EL-A/PCIe", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x14, 0, -1, .config_function = puc_config_moxa }, { 0x13a8, 0x0152, 0xffff, 0, "Exar XR17C/D152", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x13a8, 0x0154, 0xffff, 0, "Exar XR17C154", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x13a8, 0x0158, 0xffff, 0, "Exar XR17C158", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x13a8, 0x0258, 0xffff, 0, "Exar XR17V258IV", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x10, 0, -1, .config_function = puc_config_exar }, { 0x13a8, 0x0352, 0xffff, 0, "Exar XR17V352", 125000000, PUC_PORT_2S, 0x10, 0, -1, .config_function = puc_config_exar_pcie }, /* The XR17V358 uses the 125MHz PCIe clock as its reference clock. */ { 0x13a8, 0x0358, 0xffff, 0, "Exar XR17V358", 125000000, PUC_PORT_8S, 0x10, 0, -1, .config_function = puc_config_exar_pcie }, /* * The Advantech PCI-1602 Rev. A uses the first two ports of an Oxford * Semiconductor OXuPCI954. Note these boards have a hardware bug in * that they drive the RS-422/485 transmitters after power-on until a - * driver initalizes the UARTs. + * driver initializes the UARTs. */ { 0x13fe, 0x1600, 0x1602, 0x0002, "Advantech PCI-1602 Rev.
A", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 0, 8, .config_function = puc_config_advantech }, /* Advantech PCI-1602 Rev. B1/PCI-1603 are also based on OXuPCI952. */ { 0x13fe, 0xa102, 0x13fe, 0xa102, "Advantech 2-port PCI (PCI-1602 Rev. B1/PCI-1603)", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 4, 0, .config_function = puc_config_advantech }, { 0x1407, 0x0100, 0xffff, 0, "Lava Computers Dual Serial", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x1407, 0x0101, 0xffff, 0, "Lava Computers Quatro A", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x1407, 0x0102, 0xffff, 0, "Lava Computers Quatro B", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x1407, 0x0120, 0xffff, 0, "Lava Computers Quattro-PCI A", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x1407, 0x0121, 0xffff, 0, "Lava Computers Quattro-PCI B", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x1407, 0x0180, 0xffff, 0, "Lava Computers Octo A", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x1407, 0x0181, 0xffff, 0, "Lava Computers Octo B", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x1409, 0x7268, 0xffff, 0, "Sunix SUN1888", 0, PUC_PORT_2P, 0x10, 0, 8, }, { 0x1409, 0x7168, 0xffff, 0, NULL, DEFAULT_RCLK * 8, PUC_PORT_NONSTANDARD, 0x10, -1, -1, .config_function = puc_config_timedia }, /* * Boards with an Oxford Semiconductor chip. * * Oxford Semiconductor provides documentation for their chip at: * * * As sold by Kouwell . * I/O Flex PCI I/O Card Model-223 with 4 serial and 1 parallel ports. */ { 0x1415, 0x9501, 0x10fc, 0xc070, "I-O DATA RSA-PCI2/R", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 0, 8, }, { 0x1415, 0x9501, 0x131f, 0x2050, "SIIG Cyber 4 PCI 16550", DEFAULT_RCLK * 10, PUC_PORT_4S, 0x10, 0, 8, }, { 0x1415, 0x9501, 0x131f, 0x2051, "SIIG Cyber 4S PCI 16C650 (20x family)", DEFAULT_RCLK * 10, PUC_PORT_4S, 0x10, 0, 8, }, { 0x1415, 0x9501, 0x131f, 0x2052, "SIIG Quartet Serial 850", DEFAULT_RCLK * 10, PUC_PORT_4S, 0x10, 0, 8, }, { 0x1415, 0x9501, 0x14db, 0x2150, "Kuroutoshikou SERIAL4P-LPPCI2", DEFAULT_RCLK * 10, PUC_PORT_4S, 0x10, 0, 8, }, { 0x1415, 0x9501, 0xffff, 0, "Oxford Semiconductor OX16PCI954 UARTs", 0, PUC_PORT_4S, 0x10, 0, 8, .config_function = puc_config_oxford_pci954 }, { 0x1415, 0x950a, 0x131f, 0x2030, "SIIG Cyber 2S PCIe", DEFAULT_RCLK * 10, PUC_PORT_2S, 0x10, 0, 8, }, { 0x1415, 0x950a, 0x131f, 0x2032, "SIIG Cyber Serial Dual PCI 16C850", DEFAULT_RCLK * 10, PUC_PORT_4S, 0x10, 0, 8, }, { 0x1415, 0x950a, 0x131f, 0x2061, "SIIG Cyber 2SP1 PCIe", DEFAULT_RCLK * 10, PUC_PORT_2S, 0x10, 0, 8, }, { 0x1415, 0x950a, 0xffff, 0, "Oxford Semiconductor OX16PCI954 UARTs", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 0, 8, }, { 0x1415, 0x9511, 0xffff, 0, "Oxford Semiconductor OX9160/OX16PCI954 UARTs (function 1)", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 0, 8, }, { 0x1415, 0x9521, 0xffff, 0, "Oxford Semiconductor OX16PCI952 UARTs", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x1415, 0x9538, 0xffff, 0, "Oxford Semiconductor OX16PCI958 UARTs", DEFAULT_RCLK, PUC_PORT_8S, 0x18, 0, 8, }, /* * Perle boards use Oxford Semiconductor chips, but they store the * Oxford Semiconductor device ID as a subvendor device ID and use * their own device IDs. 
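 *
 * [Editor's note] On a 16x50-style UART the top baud rate is the input
 * clock divided by 16, so the DEFAULT_RCLK * 8 rating used by the Perle
 * entries below works out to 1843200 * 8 / 16 = 921600 baud, while a
 * plain DEFAULT_RCLK part tops out at 115200.  Arithmetic added by the
 * editor under the conventional divide-by-16 assumption.
 *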
{ 0x155f, 0x0331, 0xffff, 0, "Perle Ultraport4 Express", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, 8, }, { 0x155f, 0xB012, 0xffff, 0, "Perle Speed2 LE", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 0, 8, }, { 0x155f, 0xB022, 0xffff, 0, "Perle Speed2 LE", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 0, 8, }, { 0x155f, 0xB004, 0xffff, 0, "Perle Speed4 LE", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, 8, }, { 0x155f, 0xB008, 0xffff, 0, "Perle Speed8 LE", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x10, 0, 8, }, /* * Oxford Semiconductor PCI Express Expresso family * * Found in many 'native' PCI Express serial boards such as: * * eMegatech MP954ER4 (4 port) and MP958ER8 (8 port) * * * Lindy 51189 (4 port) * * * StarTech.com PEX4S952 (4 port) and PEX8S952 (8 port) * */ { 0x1415, 0xc11b, 0xffff, 0, "Oxford Semiconductor OXPCIe952 1S1P", DEFAULT_RCLK * 0x22, PUC_PORT_NONSTANDARD, 0x10, 0, -1, .config_function = puc_config_oxford_pcie }, { 0x1415, 0xc138, 0xffff, 0, "Oxford Semiconductor OXPCIe952 UARTs", DEFAULT_RCLK * 0x22, PUC_PORT_NONSTANDARD, 0x10, 0, -1, .config_function = puc_config_oxford_pcie }, { 0x1415, 0xc158, 0xffff, 0, "Oxford Semiconductor OXPCIe952 UARTs", DEFAULT_RCLK * 0x22, PUC_PORT_NONSTANDARD, 0x10, 0, -1, .config_function = puc_config_oxford_pcie }, { 0x1415, 0xc15d, 0xffff, 0, "Oxford Semiconductor OXPCIe952 UARTs (function 1)", DEFAULT_RCLK * 0x22, PUC_PORT_NONSTANDARD, 0x10, 0, -1, .config_function = puc_config_oxford_pcie }, { 0x1415, 0xc208, 0xffff, 0, "Oxford Semiconductor OXPCIe954 UARTs", DEFAULT_RCLK * 0x22, PUC_PORT_NONSTANDARD, 0x10, 0, -1, .config_function = puc_config_oxford_pcie }, { 0x1415, 0xc20d, 0xffff, 0, "Oxford Semiconductor OXPCIe954 UARTs (function 1)", DEFAULT_RCLK * 0x22, PUC_PORT_NONSTANDARD, 0x10, 0, -1, .config_function = puc_config_oxford_pcie }, { 0x1415, 0xc308, 0xffff, 0, "Oxford Semiconductor OXPCIe958 UARTs", DEFAULT_RCLK * 0x22, PUC_PORT_NONSTANDARD, 0x10, 0, -1, .config_function = puc_config_oxford_pcie }, { 0x1415, 0xc30d, 0xffff, 0, "Oxford Semiconductor OXPCIe958 UARTs (function 1)", DEFAULT_RCLK * 0x22, PUC_PORT_NONSTANDARD, 0x10, 0, -1, .config_function = puc_config_oxford_pcie }, { 0x14d2, 0x8010, 0xffff, 0, "VScom PCI-100L", DEFAULT_RCLK * 8, PUC_PORT_1S, 0x14, 0, 0, }, { 0x14d2, 0x8020, 0xffff, 0, "VScom PCI-200L", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x14, 4, 0, }, { 0x14d2, 0x8028, 0xffff, 0, "VScom 200Li", DEFAULT_RCLK, PUC_PORT_2S, 0x20, 0, 8, }, /* * VScom (Titan?) PCI-800L. More modern variant of the * PCI-800. Uses 6 discrete 16550 UARTs, plus another * two of them obviously implemented as macro cells in * the ASIC. This causes the weird port access pattern * below, where two of the IO port ranges each access * one of the ASIC UARTs, and a block of IO addresses * access the external UARTs. */ { 0x14d2, 0x8080, 0xffff, 0, "Titan VScom PCI-800L", DEFAULT_RCLK * 8, PUC_PORT_8S, 0x14, -1, -1, .config_function = puc_config_titan }, /* * VScom PCI-800H. Uses 8 16950 UARTs behind a PCI chip that offers * 4 com ports on PCI device 0 and 4 on PCI device 1. PCI device 0 has * device ID 3 and PCI device 1 device ID 4.
*/ { 0x14d2, 0xa003, 0xffff, 0, "Titan PCI-800H", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, 8, }, { 0x14d2, 0xa004, 0xffff, 0, "Titan PCI-800H", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, 8, }, { 0x14d2, 0xa005, 0xffff, 0, "Titan PCI-200H", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 0, 8, }, { 0x14d2, 0xe020, 0xffff, 0, "Titan VScom PCI-200HV2", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 4, 0, }, { 0x14d2, 0xa007, 0xffff, 0, "Titan VScom PCIex-800H", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, 8, }, { 0x14d2, 0xa008, 0xffff, 0, "Titan VScom PCIex-800H", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, 8, }, { 0x14db, 0x2130, 0xffff, 0, "Avlab Technology, PCI IO 2S", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x14db, 0x2150, 0xffff, 0, "Avlab Low Profile PCI 4 Serial", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x14db, 0x2152, 0xffff, 0, "Avlab Low Profile PCI 4 Serial", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x1592, 0x0781, 0xffff, 0, "Syba Tech Ltd. PCI-4S2P-550-ECP", DEFAULT_RCLK, PUC_PORT_4S1P, 0x10, 0, -1, .config_function = puc_config_syba }, { 0x1fd4, 0x1999, 0x1fd4, 0x0002, "Sunix SER5xxxx 2-port serial", DEFAULT_RCLK * 8, PUC_PORT_2S, 0x10, 0, 8, }, { 0x1fd4, 0x1999, 0x1fd4, 0x0004, "Sunix SER5xxxx 4-port serial", DEFAULT_RCLK * 8, PUC_PORT_4S, 0x10, 0, 8, }, { 0x1fd4, 0x1999, 0x1fd4, 0x0008, "Sunix SER5xxxx 8-port serial", DEFAULT_RCLK * 8, PUC_PORT_8S, -1, -1, -1, .config_function = puc_config_sunix }, { 0x1fd4, 0x1999, 0x1fd4, 0x0101, "Sunix MIO5xxxx 1-port serial and 1284 Printer port", DEFAULT_RCLK * 8, PUC_PORT_1S1P, -1, -1, -1, .config_function = puc_config_sunix }, { 0x1fd4, 0x1999, 0x1fd4, 0x0102, "Sunix MIO5xxxx 2-port serial and 1284 Printer port", DEFAULT_RCLK * 8, PUC_PORT_2S1P, -1, -1, -1, .config_function = puc_config_sunix }, { 0x1fd4, 0x1999, 0x1fd4, 0x0104, "Sunix MIO5xxxx 4-port serial and 1284 Printer port", DEFAULT_RCLK * 8, PUC_PORT_4S1P, -1, -1, -1, .config_function = puc_config_sunix }, { 0x5372, 0x6872, 0xffff, 0, "Feasso PCI FPP-02 2S1P", DEFAULT_RCLK, PUC_PORT_2S1P, 0x10, 4, 0, }, { 0x5372, 0x6873, 0xffff, 0, "Sun 1040 PCI Quad Serial", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x6666, 0x0001, 0xffff, 0, "Decision Computer Inc, PCCOM 4-port serial", DEFAULT_RCLK, PUC_PORT_4S, 0x1c, 0, 8, }, { 0x6666, 0x0002, 0xffff, 0, "Decision Computer Inc, PCCOM 8-port serial", DEFAULT_RCLK, PUC_PORT_8S, 0x1c, 0, 8, }, { 0x6666, 0x0004, 0xffff, 0, "PCCOM dual port RS232/422/485", DEFAULT_RCLK, PUC_PORT_2S, 0x1c, 0, 8, }, { 0x9710, 0x9815, 0xffff, 0, "NetMos NM9815 Dual 1284 Printer port", 0, PUC_PORT_2P, 0x10, 8, 0, }, /* * This is more specific than the generic NM9835 entry, and is placed * here to _prevent_ puc(4) from claiming this single port card. * * uart(4) will claim this device. 
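 *
 * [Editor's note] Ordering appears to matter here: rows are scanned in
 * table order and 0xffff/0 in the subvendor/subdevice columns reads as a
 * wildcard, so these subdevice-specific rows must precede the generic
 * NM9835 row that follows for the single-port variant to stay with uart(4).
 *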
*/ { 0x9710, 0x9835, 0x1000, 1, "NetMos NM9835 based 1-port serial", DEFAULT_RCLK, PUC_PORT_1S, 0x10, 4, 0, }, { 0x9710, 0x9835, 0x1000, 2, "NetMos NM9835 based 2-port serial", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x9710, 0x9835, 0xffff, 0, "NetMos NM9835 Dual UART and 1284 Printer port", DEFAULT_RCLK, PUC_PORT_2S1P, 0x10, 4, 0, }, { 0x9710, 0x9845, 0x1000, 0x0006, "NetMos NM9845 6 Port UART", DEFAULT_RCLK, PUC_PORT_6S, 0x10, 4, 0, }, { 0x9710, 0x9845, 0xffff, 0, "NetMos NM9845 Quad UART and 1284 Printer port", DEFAULT_RCLK, PUC_PORT_4S1P, 0x10, 4, 0, }, { 0x9710, 0x9865, 0xa000, 0x3002, "NetMos NM9865 Dual UART", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 4, 0, }, { 0x9710, 0x9865, 0xa000, 0x3003, "NetMos NM9865 Triple UART", DEFAULT_RCLK, PUC_PORT_3S, 0x10, 4, 0, }, { 0x9710, 0x9865, 0xa000, 0x3004, "NetMos NM9865 Quad UART", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 4, 0, }, { 0x9710, 0x9865, 0xa000, 0x3011, "NetMos NM9865 Single UART and 1284 Printer port", DEFAULT_RCLK, PUC_PORT_1S1P, 0x10, 4, 0, }, { 0x9710, 0x9865, 0xa000, 0x3012, "NetMos NM9865 Dual UART and 1284 Printer port", DEFAULT_RCLK, PUC_PORT_2S1P, 0x10, 4, 0, }, { 0x9710, 0x9865, 0xa000, 0x3020, "NetMos NM9865 Dual 1284 Printer port", DEFAULT_RCLK, PUC_PORT_2P, 0x10, 4, 0, }, { 0xb00c, 0x021c, 0xffff, 0, "IC Book Labs Gunboat x4 Lite", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 0, 8, .config_function = puc_config_icbook }, { 0xb00c, 0x031c, 0xffff, 0, "IC Book Labs Gunboat x4 Pro", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 0, 8, .config_function = puc_config_icbook }, { 0xb00c, 0x041c, 0xffff, 0, "IC Book Labs Ironclad x8 Lite", DEFAULT_RCLK, PUC_PORT_8S, 0x10, 0, 8, .config_function = puc_config_icbook }, { 0xb00c, 0x051c, 0xffff, 0, "IC Book Labs Ironclad x8 Pro", DEFAULT_RCLK, PUC_PORT_8S, 0x10, 0, 8, .config_function = puc_config_icbook }, { 0xb00c, 0x081c, 0xffff, 0, "IC Book Labs Dreadnought x16 Pro", DEFAULT_RCLK * 8, PUC_PORT_16S, 0x10, 0, 8, .config_function = puc_config_icbook }, { 0xb00c, 0x091c, 0xffff, 0, "IC Book Labs Dreadnought x16 Lite", DEFAULT_RCLK, PUC_PORT_16S, 0x10, 0, 8, .config_function = puc_config_icbook }, { 0xb00c, 0x0a1c, 0xffff, 0, "IC Book Labs Gunboat x2 Low Profile", DEFAULT_RCLK, PUC_PORT_2S, 0x10, 0, 8, }, { 0xb00c, 0x0b1c, 0xffff, 0, "IC Book Labs Gunboat x4 Low Profile", DEFAULT_RCLK, PUC_PORT_4S, 0x10, 0, 8, .config_function = puc_config_icbook }, { 0xffff, 0, 0xffff, 0, NULL, 0 } }; static int puc_config_advantech(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, intptr_t *res __unused) { const struct puc_cfg *cfg; struct resource *cres; struct puc_bar *bar; device_t cdev, dev; bus_size_t off; int base, crtype, fixed, high, i, oxpcie; uint8_t acr, func, mask; if (cmd != PUC_CFG_SETUP) return (ENXIO); base = fixed = oxpcie = 0; crtype = SYS_RES_IOPORT; acr = mask = 0x0; func = high = 1; off = 0x60; cfg = sc->sc_cfg; switch (cfg->subvendor) { case 0x13fe: switch (cfg->device) { case 0xa102: high = 0; break; default: break; } default: break; } if (fixed == 1) goto setup; dev = sc->sc_dev; cdev = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), func); if (cdev == NULL) { device_printf(dev, "could not find config function\n"); return (ENXIO); } i = PCIR_BAR(0); cres = bus_alloc_resource_any(cdev, crtype, &i, RF_ACTIVE); if (cres == NULL) { device_printf(dev, "could not allocate config resource\n"); return (ENXIO); } if (oxpcie == 0) { mask = bus_read_1(cres, off); if (pci_get_function(dev) == 1) base = 4; } setup: for (i = 0; i < sc->sc_nports; ++i) { device_printf(dev, "port %d: ", i); bar = 
puc_get_bar(sc, cfg->rid + i * cfg->d_rid); if (bar == NULL) { printf("could not get BAR\n"); continue; } if (fixed == 0) { if ((mask & (1 << (base + i))) == 0) { acr = 0; printf("RS-232\n"); } else { acr = (high == 1 ? 0x18 : 0x10); printf("RS-422/RS-485, active-%s auto-DTR\n", high == 1 ? "high" : "low"); } } bus_write_1(bar->b_res, REG_SPR, REG_ACR); bus_write_1(bar->b_res, REG_ICR, acr); } bus_release_resource(cdev, crtype, rman_get_rid(cres), cres); return (0); } static int puc_config_amc(struct puc_softc *sc __unused, enum puc_cfg_cmd cmd, int port, intptr_t *res) { switch (cmd) { case PUC_CFG_GET_OFS: *res = 8 * (port & 1); return (0); case PUC_CFG_GET_RID: *res = 0x14 + (port >> 1) * 4; return (0); default: break; } return (ENXIO); } static int puc_config_diva(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, intptr_t *res) { const struct puc_cfg *cfg = sc->sc_cfg; if (cmd == PUC_CFG_GET_OFS) { if (cfg->subdevice == 0x1282) /* Everest SP */ port <<= 1; else if (cfg->subdevice == 0x104b) /* Maestro SP2 */ port = (port == 3) ? 4 : port; *res = port * 8 + ((port > 2) ? 0x18 : 0); return (0); } return (ENXIO); } static int puc_config_exar(struct puc_softc *sc __unused, enum puc_cfg_cmd cmd, int port, intptr_t *res) { if (cmd == PUC_CFG_GET_OFS) { *res = port * 0x200; return (0); } return (ENXIO); } static int puc_config_exar_pcie(struct puc_softc *sc __unused, enum puc_cfg_cmd cmd, int port, intptr_t *res) { if (cmd == PUC_CFG_GET_OFS) { *res = port * 0x400; return (0); } return (ENXIO); } static int puc_config_icbook(struct puc_softc *sc __unused, enum puc_cfg_cmd cmd, int port __unused, intptr_t *res) { if (cmd == PUC_CFG_GET_ILR) { *res = PUC_ILR_DIGI; return (0); } return (ENXIO); } static int puc_config_moxa(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, intptr_t *res) { const struct puc_cfg *cfg = sc->sc_cfg; if (cmd == PUC_CFG_GET_OFS) { if (port == 3 && (cfg->device == 0x1045 || cfg->device == 0x1144)) port = 7; *res = port * 0x200; return 0; } return (ENXIO); } static int puc_config_quatech(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port __unused, intptr_t *res) { const struct puc_cfg *cfg = sc->sc_cfg; struct puc_bar *bar; uint8_t v0, v1; switch (cmd) { case PUC_CFG_SETUP: /* * Check if the scratchpad register is enabled or if the * interrupt status and options registers are active. */ bar = puc_get_bar(sc, cfg->rid); if (bar == NULL) return (ENXIO); bus_write_1(bar->b_res, REG_LCR, LCR_DLAB); bus_write_1(bar->b_res, REG_SPR, 0); v0 = bus_read_1(bar->b_res, REG_SPR); bus_write_1(bar->b_res, REG_SPR, 0x80 + -cfg->clock); v1 = bus_read_1(bar->b_res, REG_SPR); bus_write_1(bar->b_res, REG_LCR, 0); sc->sc_cfg_data = (v0 << 8) | v1; if (v0 == 0 && v1 == 0x80 + -cfg->clock) { /* * The SPR register echoed the two values written * by us. This means that the SPAD jumper is set. */ device_printf(sc->sc_dev, "warning: extra features " "not usable -- SPAD compatibility enabled\n"); return (0); } if (v0 != 0) { /* * The first value doesn't match. This can only mean * that the SPAD jumper is not set and that a non- * standard fixed clock multiplier jumper is set. */ if (bootverbose) device_printf(sc->sc_dev, "fixed clock rate " "multiplier of %d\n", 1 << v0); if (v0 < -cfg->clock) device_printf(sc->sc_dev, "warning: " "suboptimal fixed clock rate multiplier " "setting\n"); return (0); } /* * The first value matched, but the second didn't. We know * that the SPAD jumper is not set. 
We also know that the * clock rate multiplier is software controlled *and* that * we just programmed it to the maximum allowed. */ if (bootverbose) device_printf(sc->sc_dev, "clock rate multiplier of " "%d selected\n", 1 << -cfg->clock); return (0); case PUC_CFG_GET_CLOCK: v0 = (sc->sc_cfg_data >> 8) & 0xff; v1 = sc->sc_cfg_data & 0xff; if (v0 == 0 && v1 == 0x80 + -cfg->clock) { /* * XXX With the SPAD jumper applied, there's no * easy way of knowing if there's also a clock * rate multiplier jumper installed. Let's hope * not ... */ *res = DEFAULT_RCLK; } else if (v0 == 0) { /* * No clock rate multiplier jumper installed, * so we programmed the board with the maximum * multiplier allowed as given to us in the * clock field of the config record (negated). */ *res = DEFAULT_RCLK << -cfg->clock; } else *res = DEFAULT_RCLK << v0; return (0); case PUC_CFG_GET_ILR: v0 = (sc->sc_cfg_data >> 8) & 0xff; v1 = sc->sc_cfg_data & 0xff; *res = (v0 == 0 && v1 == 0x80 + -cfg->clock) ? PUC_ILR_NONE : PUC_ILR_QUATECH; return (0); default: break; } return (ENXIO); } static int puc_config_syba(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, intptr_t *res) { static int base[] = { 0x251, 0x3f0, 0 }; const struct puc_cfg *cfg = sc->sc_cfg; struct puc_bar *bar; int efir, idx, ofs; uint8_t v; switch (cmd) { case PUC_CFG_SETUP: bar = puc_get_bar(sc, cfg->rid); if (bar == NULL) return (ENXIO); /* configure both W83877TFs */ bus_write_1(bar->b_res, 0x250, 0x89); bus_write_1(bar->b_res, 0x3f0, 0x87); bus_write_1(bar->b_res, 0x3f0, 0x87); idx = 0; while (base[idx] != 0) { efir = base[idx]; bus_write_1(bar->b_res, efir, 0x09); v = bus_read_1(bar->b_res, efir + 1); if ((v & 0x0f) != 0x0c) return (ENXIO); bus_write_1(bar->b_res, efir, 0x16); v = bus_read_1(bar->b_res, efir + 1); bus_write_1(bar->b_res, efir, 0x16); bus_write_1(bar->b_res, efir + 1, v | 0x04); bus_write_1(bar->b_res, efir, 0x16); bus_write_1(bar->b_res, efir + 1, v & ~0x04); ofs = base[idx] & 0x300; bus_write_1(bar->b_res, efir, 0x23); bus_write_1(bar->b_res, efir + 1, (ofs + 0x78) >> 2); bus_write_1(bar->b_res, efir, 0x24); bus_write_1(bar->b_res, efir + 1, (ofs + 0xf8) >> 2); bus_write_1(bar->b_res, efir, 0x25); bus_write_1(bar->b_res, efir + 1, (ofs + 0xe8) >> 2); bus_write_1(bar->b_res, efir, 0x17); bus_write_1(bar->b_res, efir + 1, 0x03); bus_write_1(bar->b_res, efir, 0x28); bus_write_1(bar->b_res, efir + 1, 0x43); idx++; } bus_write_1(bar->b_res, 0x250, 0xaa); bus_write_1(bar->b_res, 0x3f0, 0xaa); return (0); case PUC_CFG_GET_OFS: switch (port) { case 0: *res = 0x2f8; return (0); case 1: *res = 0x2e8; return (0); case 2: *res = 0x3f8; return (0); case 3: *res = 0x3e8; return (0); case 4: *res = 0x278; return (0); } break; default: break; } return (ENXIO); } static int puc_config_siig(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, intptr_t *res) { const struct puc_cfg *cfg = sc->sc_cfg; switch (cmd) { case PUC_CFG_GET_OFS: if (cfg->ports == PUC_PORT_8S) { *res = (port > 4) ? 8 * (port - 4) : 0; return (0); } break; case PUC_CFG_GET_RID: if (cfg->ports == PUC_PORT_8S) { *res = 0x10 + ((port > 4) ? 
0x10 : 4 * port); return (0); } if (cfg->ports == PUC_PORT_2S1P) { switch (port) { case 0: *res = 0x10; return (0); case 1: *res = 0x14; return (0); case 2: *res = 0x1c; return (0); } } break; default: break; } return (ENXIO); } static int puc_config_timedia(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, intptr_t *res) { static const uint16_t dual[] = { 0x0002, 0x4036, 0x4037, 0x4038, 0x4078, 0x4079, 0x4085, 0x4088, 0x4089, 0x5037, 0x5078, 0x5079, 0x5085, 0x6079, 0x7079, 0x8079, 0x8137, 0x8138, 0x8237, 0x8238, 0x9079, 0x9137, 0x9138, 0x9237, 0x9238, 0xA079, 0xB079, 0xC079, 0xD079, 0 }; static const uint16_t quad[] = { 0x4055, 0x4056, 0x4095, 0x4096, 0x5056, 0x8156, 0x8157, 0x8256, 0x8257, 0x9056, 0x9156, 0x9157, 0x9158, 0x9159, 0x9256, 0x9257, 0xA056, 0xA157, 0xA158, 0xA159, 0xB056, 0xB157, 0 }; static const uint16_t octa[] = { 0x4065, 0x4066, 0x5065, 0x5066, 0x8166, 0x9066, 0x9166, 0x9167, 0x9168, 0xA066, 0xA167, 0xA168, 0 }; static const struct { int ports; const uint16_t *ids; } subdevs[] = { { 2, dual }, { 4, quad }, { 8, octa }, { 0, NULL } }; static char desc[64]; int dev, id; uint16_t subdev; switch (cmd) { case PUC_CFG_GET_CLOCK: if (port < 2) *res = DEFAULT_RCLK * 8; else *res = DEFAULT_RCLK; return (0); case PUC_CFG_GET_DESC: snprintf(desc, sizeof(desc), "Timedia technology %d Port Serial", (int)sc->sc_cfg_data); *res = (intptr_t)desc; return (0); case PUC_CFG_GET_NPORTS: subdev = pci_get_subdevice(sc->sc_dev); dev = 0; while (subdevs[dev].ports != 0) { id = 0; while (subdevs[dev].ids[id] != 0) { if (subdev == subdevs[dev].ids[id]) { sc->sc_cfg_data = subdevs[dev].ports; *res = sc->sc_cfg_data; return (0); } id++; } dev++; } return (ENXIO); case PUC_CFG_GET_OFS: *res = (port == 1 || port == 3) ? 8 : 0; return (0); case PUC_CFG_GET_RID: *res = 0x10 + ((port > 3) ? port - 2 : port >> 1) * 4; return (0); case PUC_CFG_GET_TYPE: *res = PUC_TYPE_SERIAL; return (0); default: break; } return (ENXIO); } static int puc_config_oxford_pci954(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port __unused, intptr_t *res) { switch (cmd) { case PUC_CFG_GET_CLOCK: /* * OXu16PCI954 use a 14.7456 MHz clock by default while * OX16PCI954 and OXm16PCI954 employ a 1.8432 MHz one. */ if (pci_get_revid(sc->sc_dev) == 1) *res = DEFAULT_RCLK * 8; else *res = DEFAULT_RCLK; return (0); default: break; } return (ENXIO); } static int puc_config_oxford_pcie(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, intptr_t *res) { const struct puc_cfg *cfg = sc->sc_cfg; int idx; struct puc_bar *bar; uint8_t value; switch (cmd) { case PUC_CFG_SETUP: device_printf(sc->sc_dev, "%d UARTs detected\n", sc->sc_nports); /* Set UARTs to enhanced mode */ bar = puc_get_bar(sc, cfg->rid); if (bar == NULL) return (ENXIO); for (idx = 0; idx < sc->sc_nports; idx++) { value = bus_read_1(bar->b_res, 0x1000 + (idx << 9) + 0x92); bus_write_1(bar->b_res, 0x1000 + (idx << 9) + 0x92, value | 0x10); } return (0); case PUC_CFG_GET_LEN: *res = 0x200; return (0); case PUC_CFG_GET_NPORTS: /* * Check if we are being called from puc_bfe_attach() * or puc_bfe_probe(). If puc_bfe_probe(), we cannot * puc_get_bar(), so we return a value of 16. This has * cosmetic side-effects at worst; in PUC_CFG_GET_DESC, * sc->sc_cfg_data will not contain the true number of * ports in PUC_CFG_GET_DESC, but we are not implementing * that call for this device family anyway. * * The check is for initialization of sc->sc_bar[idx], * which is only done in puc_bfe_attach(). 
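 *
 * Once attached, the true count is read from the device itself:
 * the byte at offset 0x04 of the MMIO BAR (the bus_read_1() below)
 * reports how many UARTs the chip implements.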
*/ idx = 0; do { if (sc->sc_bar[idx++].b_rid != -1) { sc->sc_cfg_data = 16; *res = sc->sc_cfg_data; return (0); } } while (idx < PUC_PCI_BARS); bar = puc_get_bar(sc, cfg->rid); if (bar == NULL) return (ENXIO); value = bus_read_1(bar->b_res, 0x04); if (value == 0) return (ENXIO); sc->sc_cfg_data = value; *res = sc->sc_cfg_data; return (0); case PUC_CFG_GET_OFS: *res = 0x1000 + (port << 9); return (0); case PUC_CFG_GET_TYPE: *res = PUC_TYPE_SERIAL; return (0); default: break; } return (ENXIO); } static int puc_config_sunix(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, intptr_t *res) { int error; switch (cmd) { case PUC_CFG_GET_OFS: error = puc_config(sc, PUC_CFG_GET_TYPE, port, res); if (error != 0) return (error); *res = (*res == PUC_TYPE_SERIAL) ? (port & 3) * 8 : 0; return (0); case PUC_CFG_GET_RID: error = puc_config(sc, PUC_CFG_GET_TYPE, port, res); if (error != 0) return (error); *res = (*res == PUC_TYPE_SERIAL && port <= 3) ? 0x10 : 0x14; return (0); default: break; } return (ENXIO); } static int puc_config_titan(struct puc_softc *sc __unused, enum puc_cfg_cmd cmd, int port, intptr_t *res) { switch (cmd) { case PUC_CFG_GET_OFS: *res = (port < 3) ? 0 : (port - 2) << 3; return (0); case PUC_CFG_GET_RID: *res = 0x14 + ((port >= 2) ? 0x0c : port << 2); return (0); default: break; } return (ENXIO); } Index: head/sys/dev/usb/net/if_axe.c =================================================================== --- head/sys/dev/usb/net/if_axe.c (revision 357663) +++ head/sys/dev/usb/net/if_axe.c (revision 357664) @@ -1,1510 +1,1510 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1997, 1998, 1999, 2000-2003 * Bill Paul . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Bill Paul. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY Bill Paul AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * ASIX Electronics AX88172/AX88178/AX88778 USB 2.0 ethernet driver. * Used in the LinkSys USB200M and various other adapters. 
* * Manuals available from: * http://www.asix.com.tw/datasheet/mac/Ax88172.PDF * Note: you need the manual for the AX88170 chip (USB 1.x ethernet * controller) to find the definitions for the RX control register. * http://www.asix.com.tw/datasheet/mac/Ax88170.PDF * * Written by Bill Paul * Senior Engineer * Wind River Systems */ /* * The AX88172 provides USB ethernet supports at 10 and 100Mbps. * It uses an external PHY (reference designs use a RealTek chip), * and has a 64-bit multicast hash filter. There is some information * missing from the manual which one needs to know in order to make * the chip function: * * - You must set bit 7 in the RX control register, otherwise the * chip won't receive any packets. * - You must initialize all 3 IPG registers, or you won't be able * to send any packets. * * Note that this device appears to only support loading the station - * address via autload from the EEPROM (i.e. there's no way to manaully + * address via autload from the EEPROM (i.e. there's no way to manually * set it). * * (Adam Weinberger wanted me to name this driver if_gir.c.) */ /* * Ax88178 and Ax88772 support backported from the OpenBSD driver. * 2007/02/12, J.R. Oldroyd, fbsd@opal.com * * Manual here: * http://www.asix.com.tw/FrootAttach/datasheet/AX88178_datasheet_Rev10.pdf * http://www.asix.com.tw/FrootAttach/datasheet/AX88772_datasheet_Rev10.pdf */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "usbdevs.h" #define USB_DEBUG_VAR axe_debug #include #include #include #include #include "miibus_if.h" /* * AXE_178_MAX_FRAME_BURST * max frame burst size for Ax88178 and Ax88772 * 0 2048 bytes * 1 4096 bytes * 2 8192 bytes * 3 16384 bytes * use the largest your system can handle without USB stalling. * * NB: 88772 parts appear to generate lots of input errors with * a 2K rx buffer and 8K is only slightly faster than 4K on an * EHCI port on a T42 so change at your own risk. */ #define AXE_178_MAX_FRAME_BURST 1 #define AXE_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP) #ifdef USB_DEBUG static int axe_debug = 0; static SYSCTL_NODE(_hw_usb, OID_AUTO, axe, CTLFLAG_RW, 0, "USB axe"); SYSCTL_INT(_hw_usb_axe, OID_AUTO, debug, CTLFLAG_RWTUN, &axe_debug, 0, "Debug level"); #endif /* * Various supported device vendors/products. 
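 *
 * The driver_info field of each entry (the AXE_FLAG_178/772/772A/772B
 * values) is saved in sc_flags at attach time and later selects the
 * chip-specific initialization path in axe_attach_post() and
 * axe_reset().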
*/ static const STRUCT_USB_HOST_ID axe_devs[] = { #define AXE_DEV(v,p,i) { USB_VPI(USB_VENDOR_##v, USB_PRODUCT_##v##_##p, i) } AXE_DEV(ABOCOM, UF200, 0), AXE_DEV(ACERCM, EP1427X2, 0), AXE_DEV(APPLE, ETHERNET, AXE_FLAG_772), AXE_DEV(ASIX, AX88172, 0), AXE_DEV(ASIX, AX88178, AXE_FLAG_178), AXE_DEV(ASIX, AX88772, AXE_FLAG_772), AXE_DEV(ASIX, AX88772A, AXE_FLAG_772A), AXE_DEV(ASIX, AX88772B, AXE_FLAG_772B), AXE_DEV(ASIX, AX88772B_1, AXE_FLAG_772B), AXE_DEV(ATEN, UC210T, 0), AXE_DEV(BELKIN, F5D5055, AXE_FLAG_178), AXE_DEV(BILLIONTON, USB2AR, 0), AXE_DEV(CISCOLINKSYS, USB200MV2, AXE_FLAG_772A), AXE_DEV(COREGA, FETHER_USB2_TX, 0), AXE_DEV(DLINK, DUBE100, 0), AXE_DEV(DLINK, DUBE100B1, AXE_FLAG_772), AXE_DEV(DLINK, DUBE100C1, AXE_FLAG_772B), AXE_DEV(GOODWAY, GWUSB2E, 0), AXE_DEV(IODATA, ETGUS2, AXE_FLAG_178), AXE_DEV(JVC, MP_PRX1, 0), AXE_DEV(LENOVO, ETHERNET, AXE_FLAG_772B), AXE_DEV(LINKSYS2, USB200M, 0), AXE_DEV(LINKSYS4, USB1000, AXE_FLAG_178), AXE_DEV(LOGITEC, LAN_GTJU2A, AXE_FLAG_178), AXE_DEV(MELCO, LUAU2KTX, 0), AXE_DEV(MELCO, LUA3U2AGT, AXE_FLAG_178), AXE_DEV(NETGEAR, FA120, 0), AXE_DEV(OQO, ETHER01PLUS, AXE_FLAG_772), AXE_DEV(PLANEX3, GU1000T, AXE_FLAG_178), AXE_DEV(SITECOM, LN029, 0), AXE_DEV(SITECOMEU, LN028, AXE_FLAG_178), AXE_DEV(SITECOMEU, LN031, AXE_FLAG_178), AXE_DEV(SYSTEMTALKS, SGCX2UL, 0), #undef AXE_DEV }; static device_probe_t axe_probe; static device_attach_t axe_attach; static device_detach_t axe_detach; static usb_callback_t axe_bulk_read_callback; static usb_callback_t axe_bulk_write_callback; static miibus_readreg_t axe_miibus_readreg; static miibus_writereg_t axe_miibus_writereg; static miibus_statchg_t axe_miibus_statchg; static uether_fn_t axe_attach_post; static uether_fn_t axe_init; static uether_fn_t axe_stop; static uether_fn_t axe_start; static uether_fn_t axe_tick; static uether_fn_t axe_setmulti; static uether_fn_t axe_setpromisc; static int axe_attach_post_sub(struct usb_ether *); static int axe_ifmedia_upd(struct ifnet *); static void axe_ifmedia_sts(struct ifnet *, struct ifmediareq *); static int axe_cmd(struct axe_softc *, int, int, int, void *); static void axe_ax88178_init(struct axe_softc *); static void axe_ax88772_init(struct axe_softc *); static void axe_ax88772_phywake(struct axe_softc *); static void axe_ax88772a_init(struct axe_softc *); static void axe_ax88772b_init(struct axe_softc *); static int axe_get_phyno(struct axe_softc *, int); static int axe_ioctl(struct ifnet *, u_long, caddr_t); static int axe_rx_frame(struct usb_ether *, struct usb_page_cache *, int); static int axe_rxeof(struct usb_ether *, struct usb_page_cache *, unsigned int offset, unsigned int, struct axe_csum_hdr *); static void axe_csum_cfg(struct usb_ether *); static const struct usb_config axe_config[AXE_N_TRANSFER] = { [AXE_BULK_DT_WR] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_OUT, .frames = 16, .bufsize = 16 * MCLBYTES, .flags = {.pipe_bof = 1,.force_short_xfer = 1,}, .callback = axe_bulk_write_callback, .timeout = 10000, /* 10 seconds */ }, [AXE_BULK_DT_RD] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .bufsize = 16384, /* bytes */ .flags = {.pipe_bof = 1,.short_xfer_ok = 1,}, .callback = axe_bulk_read_callback, .timeout = 0, /* no timeout */ }, }; static const struct ax88772b_mfb ax88772b_mfb_table[] = { { 0x8000, 0x8001, 2048 }, { 0x8100, 0x8147, 4096}, { 0x8200, 0x81EB, 6144}, { 0x8300, 0x83D7, 8192}, { 0x8400, 0x851E, 16384}, { 0x8500, 0x8666, 20480}, { 0x8600, 0x87AE, 24576}, { 0x8700, 0x8A3D, 32768} }; static 
device_method_t axe_methods[] = { /* Device interface */ DEVMETHOD(device_probe, axe_probe), DEVMETHOD(device_attach, axe_attach), DEVMETHOD(device_detach, axe_detach), /* MII interface */ DEVMETHOD(miibus_readreg, axe_miibus_readreg), DEVMETHOD(miibus_writereg, axe_miibus_writereg), DEVMETHOD(miibus_statchg, axe_miibus_statchg), DEVMETHOD_END }; static driver_t axe_driver = { .name = "axe", .methods = axe_methods, .size = sizeof(struct axe_softc), }; static devclass_t axe_devclass; DRIVER_MODULE(axe, uhub, axe_driver, axe_devclass, NULL, 0); DRIVER_MODULE(miibus, axe, miibus_driver, miibus_devclass, 0, 0); MODULE_DEPEND(axe, uether, 1, 1, 1); MODULE_DEPEND(axe, usb, 1, 1, 1); MODULE_DEPEND(axe, ether, 1, 1, 1); MODULE_DEPEND(axe, miibus, 1, 1, 1); MODULE_VERSION(axe, 1); USB_PNP_HOST_INFO(axe_devs); static const struct usb_ether_methods axe_ue_methods = { .ue_attach_post = axe_attach_post, .ue_attach_post_sub = axe_attach_post_sub, .ue_start = axe_start, .ue_init = axe_init, .ue_stop = axe_stop, .ue_tick = axe_tick, .ue_setmulti = axe_setmulti, .ue_setpromisc = axe_setpromisc, .ue_mii_upd = axe_ifmedia_upd, .ue_mii_sts = axe_ifmedia_sts, }; static int axe_cmd(struct axe_softc *sc, int cmd, int index, int val, void *buf) { struct usb_device_request req; usb_error_t err; AXE_LOCK_ASSERT(sc, MA_OWNED); req.bmRequestType = (AXE_CMD_IS_WRITE(cmd) ? UT_WRITE_VENDOR_DEVICE : UT_READ_VENDOR_DEVICE); req.bRequest = AXE_CMD_CMD(cmd); USETW(req.wValue, val); USETW(req.wIndex, index); USETW(req.wLength, AXE_CMD_LEN(cmd)); err = uether_do_request(&sc->sc_ue, &req, buf, 1000); return (err); } static int axe_miibus_readreg(device_t dev, int phy, int reg) { struct axe_softc *sc = device_get_softc(dev); uint16_t val; int locked; locked = mtx_owned(&sc->sc_mtx); if (!locked) AXE_LOCK(sc); axe_cmd(sc, AXE_CMD_MII_OPMODE_SW, 0, 0, NULL); axe_cmd(sc, AXE_CMD_MII_READ_REG, reg, phy, &val); axe_cmd(sc, AXE_CMD_MII_OPMODE_HW, 0, 0, NULL); val = le16toh(val); if (AXE_IS_772(sc) && reg == MII_BMSR) { /* * The BMSR of the AX88772 indicates that it supports * extended capability, but the extended status register * is reserved for the embedded ethernet PHY. So clear * the extended capability bit of the BMSR. */ val &= ~BMSR_EXTCAP; } if (!locked) AXE_UNLOCK(sc); return (val); } static int axe_miibus_writereg(device_t dev, int phy, int reg, int val) { struct axe_softc *sc = device_get_softc(dev); int locked; val = htole32(val); locked = mtx_owned(&sc->sc_mtx); if (!locked) AXE_LOCK(sc); axe_cmd(sc, AXE_CMD_MII_OPMODE_SW, 0, 0, NULL); axe_cmd(sc, AXE_CMD_MII_WRITE_REG, reg, phy, &val); axe_cmd(sc, AXE_CMD_MII_OPMODE_HW, 0, 0, NULL); if (!locked) AXE_UNLOCK(sc); return (0); } static void axe_miibus_statchg(device_t dev) { struct axe_softc *sc = device_get_softc(dev); struct mii_data *mii = GET_MII(sc); struct ifnet *ifp; uint16_t val; int err, locked; locked = mtx_owned(&sc->sc_mtx); if (!locked) AXE_LOCK(sc); ifp = uether_getifp(&sc->sc_ue); if (mii == NULL || ifp == NULL || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) goto done; sc->sc_flags &= ~AXE_FLAG_LINK; if ((mii->mii_media_status & (IFM_ACTIVE | IFM_AVALID)) == (IFM_ACTIVE | IFM_AVALID)) { switch (IFM_SUBTYPE(mii->mii_media_active)) { case IFM_10_T: case IFM_100_TX: sc->sc_flags |= AXE_FLAG_LINK; break; case IFM_1000_T: if ((sc->sc_flags & AXE_FLAG_178) == 0) break; sc->sc_flags |= AXE_FLAG_LINK; break; default: break; } } /* Lost link, do nothing.
*/ if ((sc->sc_flags & AXE_FLAG_LINK) == 0) goto done; val = 0; if ((IFM_OPTIONS(mii->mii_media_active) & IFM_FDX) != 0) { val |= AXE_MEDIA_FULL_DUPLEX; if (AXE_IS_178_FAMILY(sc)) { if ((IFM_OPTIONS(mii->mii_media_active) & IFM_ETH_TXPAUSE) != 0) val |= AXE_178_MEDIA_TXFLOW_CONTROL_EN; if ((IFM_OPTIONS(mii->mii_media_active) & IFM_ETH_RXPAUSE) != 0) val |= AXE_178_MEDIA_RXFLOW_CONTROL_EN; } } if (AXE_IS_178_FAMILY(sc)) { val |= AXE_178_MEDIA_RX_EN | AXE_178_MEDIA_MAGIC; if ((sc->sc_flags & AXE_FLAG_178) != 0) val |= AXE_178_MEDIA_ENCK; switch (IFM_SUBTYPE(mii->mii_media_active)) { case IFM_1000_T: val |= AXE_178_MEDIA_GMII | AXE_178_MEDIA_ENCK; break; case IFM_100_TX: val |= AXE_178_MEDIA_100TX; break; case IFM_10_T: /* doesn't need to be handled */ break; } } err = axe_cmd(sc, AXE_CMD_WRITE_MEDIA, 0, val, NULL); if (err) device_printf(dev, "media change failed, error %d\n", err); done: if (!locked) AXE_UNLOCK(sc); } /* * Set media options. */ static int axe_ifmedia_upd(struct ifnet *ifp) { struct axe_softc *sc = ifp->if_softc; struct mii_data *mii = GET_MII(sc); struct mii_softc *miisc; int error; AXE_LOCK_ASSERT(sc, MA_OWNED); LIST_FOREACH(miisc, &mii->mii_phys, mii_list) PHY_RESET(miisc); error = mii_mediachg(mii); return (error); } /* * Report current media status. */ static void axe_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct axe_softc *sc = ifp->if_softc; struct mii_data *mii = GET_MII(sc); AXE_LOCK(sc); mii_pollstat(mii); ifmr->ifm_active = mii->mii_media_active; ifmr->ifm_status = mii->mii_media_status; AXE_UNLOCK(sc); } static u_int axe_hash_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt) { uint8_t *hashtbl = arg; uint32_t h; h = ether_crc32_be(LLADDR(sdl), ETHER_ADDR_LEN) >> 26; hashtbl[h / 8] |= 1 << (h % 8); return (1); } static void axe_setmulti(struct usb_ether *ue) { struct axe_softc *sc = uether_getsc(ue); struct ifnet *ifp = uether_getifp(ue); uint16_t rxmode; uint8_t hashtbl[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; AXE_LOCK_ASSERT(sc, MA_OWNED); axe_cmd(sc, AXE_CMD_RXCTL_READ, 0, 0, &rxmode); rxmode = le16toh(rxmode); if (ifp->if_flags & (IFF_ALLMULTI | IFF_PROMISC)) { rxmode |= AXE_RXCMD_ALLMULTI; axe_cmd(sc, AXE_CMD_RXCTL_WRITE, 0, rxmode, NULL); return; } rxmode &= ~AXE_RXCMD_ALLMULTI; if_foreach_llmaddr(ifp, axe_hash_maddr, &hashtbl); axe_cmd(sc, AXE_CMD_WRITE_MCAST, 0, 0, (void *)&hashtbl); axe_cmd(sc, AXE_CMD_RXCTL_WRITE, 0, rxmode, NULL); } static int axe_get_phyno(struct axe_softc *sc, int sel) { int phyno; switch (AXE_PHY_TYPE(sc->sc_phyaddrs[sel])) { case PHY_TYPE_100_HOME: case PHY_TYPE_GIG: phyno = AXE_PHY_NO(sc->sc_phyaddrs[sel]); break; case PHY_TYPE_SPECIAL: /* FALLTHROUGH */ case PHY_TYPE_RSVD: /* FALLTHROUGH */ case PHY_TYPE_NON_SUP: /* FALLTHROUGH */ default: phyno = -1; break; } return (phyno); } #define AXE_GPIO_WRITE(x, y) do { \ axe_cmd(sc, AXE_CMD_WRITE_GPIO, 0, (x), NULL); \ uether_pause(ue, (y)); \ } while (0) static void axe_ax88178_init(struct axe_softc *sc) { struct usb_ether *ue; int gpio0, ledmode, phymode; uint16_t eeprom, val; ue = &sc->sc_ue; axe_cmd(sc, AXE_CMD_SROM_WR_ENABLE, 0, 0, NULL); /* XXX magic */ axe_cmd(sc, AXE_CMD_SROM_READ, 0, 0x0017, &eeprom); eeprom = le16toh(eeprom); axe_cmd(sc, AXE_CMD_SROM_WR_DISABLE, 0, 0, NULL); /* if EEPROM is invalid we have to use to GPIO0 */ if (eeprom == 0xffff) { phymode = AXE_PHY_MODE_MARVELL; gpio0 = 1; ledmode = 0; } else { phymode = eeprom & 0x7f; gpio0 = (eeprom & 0x80) ? 
0 : 1; ledmode = eeprom >> 8; } if (bootverbose) device_printf(sc->sc_ue.ue_dev, "EEPROM data : 0x%04x, phymode : 0x%02x\n", eeprom, phymode); /* Program GPIOs depending on PHY hardware. */ switch (phymode) { case AXE_PHY_MODE_MARVELL: if (gpio0 == 1) { AXE_GPIO_WRITE(AXE_GPIO_RELOAD_EEPROM | AXE_GPIO0_EN, hz / 32); AXE_GPIO_WRITE(AXE_GPIO0_EN | AXE_GPIO2 | AXE_GPIO2_EN, hz / 32); AXE_GPIO_WRITE(AXE_GPIO0_EN | AXE_GPIO2_EN, hz / 4); AXE_GPIO_WRITE(AXE_GPIO0_EN | AXE_GPIO2 | AXE_GPIO2_EN, hz / 32); } else { AXE_GPIO_WRITE(AXE_GPIO_RELOAD_EEPROM | AXE_GPIO1 | AXE_GPIO1_EN, hz / 3); if (ledmode == 1) { AXE_GPIO_WRITE(AXE_GPIO1_EN, hz / 3); AXE_GPIO_WRITE(AXE_GPIO1 | AXE_GPIO1_EN, hz / 3); } else { AXE_GPIO_WRITE(AXE_GPIO1 | AXE_GPIO1_EN | AXE_GPIO2 | AXE_GPIO2_EN, hz / 32); AXE_GPIO_WRITE(AXE_GPIO1 | AXE_GPIO1_EN | AXE_GPIO2_EN, hz / 4); AXE_GPIO_WRITE(AXE_GPIO1 | AXE_GPIO1_EN | AXE_GPIO2 | AXE_GPIO2_EN, hz / 32); } } break; case AXE_PHY_MODE_CICADA: case AXE_PHY_MODE_CICADA_V2: case AXE_PHY_MODE_CICADA_V2_ASIX: if (gpio0 == 1) AXE_GPIO_WRITE(AXE_GPIO_RELOAD_EEPROM | AXE_GPIO0 | AXE_GPIO0_EN, hz / 32); else AXE_GPIO_WRITE(AXE_GPIO_RELOAD_EEPROM | AXE_GPIO1 | AXE_GPIO1_EN, hz / 32); break; case AXE_PHY_MODE_AGERE: AXE_GPIO_WRITE(AXE_GPIO_RELOAD_EEPROM | AXE_GPIO1 | AXE_GPIO1_EN, hz / 32); AXE_GPIO_WRITE(AXE_GPIO1 | AXE_GPIO1_EN | AXE_GPIO2 | AXE_GPIO2_EN, hz / 32); AXE_GPIO_WRITE(AXE_GPIO1 | AXE_GPIO1_EN | AXE_GPIO2_EN, hz / 4); AXE_GPIO_WRITE(AXE_GPIO1 | AXE_GPIO1_EN | AXE_GPIO2 | AXE_GPIO2_EN, hz / 32); break; case AXE_PHY_MODE_REALTEK_8211CL: case AXE_PHY_MODE_REALTEK_8211BN: case AXE_PHY_MODE_REALTEK_8251CL: val = gpio0 == 1 ? AXE_GPIO0 | AXE_GPIO0_EN : AXE_GPIO1 | AXE_GPIO1_EN; AXE_GPIO_WRITE(val, hz / 32); AXE_GPIO_WRITE(val | AXE_GPIO2 | AXE_GPIO2_EN, hz / 32); AXE_GPIO_WRITE(val | AXE_GPIO2_EN, hz / 4); AXE_GPIO_WRITE(val | AXE_GPIO2 | AXE_GPIO2_EN, hz / 32); if (phymode == AXE_PHY_MODE_REALTEK_8211CL) { axe_miibus_writereg(ue->ue_dev, sc->sc_phyno, 0x1F, 0x0005); axe_miibus_writereg(ue->ue_dev, sc->sc_phyno, 0x0C, 0x0000); val = axe_miibus_readreg(ue->ue_dev, sc->sc_phyno, 0x0001); axe_miibus_writereg(ue->ue_dev, sc->sc_phyno, 0x01, val | 0x0080); axe_miibus_writereg(ue->ue_dev, sc->sc_phyno, 0x1F, 0x0000); } break; default: /* Unknown PHY model or no need to program GPIOs. */ break; } /* soft reset */ axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_CLEAR, NULL); uether_pause(ue, hz / 4); axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_PRL | AXE_178_RESET_MAGIC, NULL); uether_pause(ue, hz / 4); /* Enable MII/GMII/RGMII interface to work with external PHY. 
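 *
 * (Writing 0 to AXE_CMD_SW_PHY_SELECT asks for the external PHY;
 * compare axe_ax88772_init() below, which writes 0x01 to select the
 * embedded PHY instead.)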
*/ axe_cmd(sc, AXE_CMD_SW_PHY_SELECT, 0, 0, NULL); uether_pause(ue, hz / 4); axe_cmd(sc, AXE_CMD_RXCTL_WRITE, 0, 0, NULL); } static void axe_ax88772_init(struct axe_softc *sc) { axe_cmd(sc, AXE_CMD_WRITE_GPIO, 0, 0x00b0, NULL); uether_pause(&sc->sc_ue, hz / 16); if (sc->sc_phyno == AXE_772_PHY_NO_EPHY) { /* ask for the embedded PHY */ axe_cmd(sc, AXE_CMD_SW_PHY_SELECT, 0, 0x01, NULL); uether_pause(&sc->sc_ue, hz / 64); /* power down and reset state, pin reset state */ axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_CLEAR, NULL); uether_pause(&sc->sc_ue, hz / 16); /* power down/reset state, pin operating state */ axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_IPPD | AXE_SW_RESET_PRL, NULL); uether_pause(&sc->sc_ue, hz / 4); /* power up, reset */ axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_PRL, NULL); /* power up, operating */ axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_IPRL | AXE_SW_RESET_PRL, NULL); } else { /* ask for external PHY */ axe_cmd(sc, AXE_CMD_SW_PHY_SELECT, 0, 0x00, NULL); uether_pause(&sc->sc_ue, hz / 64); /* power down internal PHY */ axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_IPPD | AXE_SW_RESET_PRL, NULL); } uether_pause(&sc->sc_ue, hz / 4); axe_cmd(sc, AXE_CMD_RXCTL_WRITE, 0, 0, NULL); } static void axe_ax88772_phywake(struct axe_softc *sc) { struct usb_ether *ue; ue = &sc->sc_ue; if (sc->sc_phyno == AXE_772_PHY_NO_EPHY) { /* Manually select internal(embedded) PHY - MAC mode. */ axe_cmd(sc, AXE_CMD_SW_PHY_SELECT, 0, AXE_SW_PHY_SELECT_SS_ENB | AXE_SW_PHY_SELECT_EMBEDDED | AXE_SW_PHY_SELECT_SS_MII, NULL); uether_pause(&sc->sc_ue, hz / 32); } else { /* * Manually select external PHY - MAC mode. * Reverse MII/RMII is for AX88772A PHY mode. */ axe_cmd(sc, AXE_CMD_SW_PHY_SELECT, 0, AXE_SW_PHY_SELECT_SS_ENB | AXE_SW_PHY_SELECT_EXT | AXE_SW_PHY_SELECT_SS_MII, NULL); uether_pause(&sc->sc_ue, hz / 32); } /* Take PHY out of power down. */ axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_IPPD | AXE_SW_RESET_IPRL, NULL); uether_pause(&sc->sc_ue, hz / 4); axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_IPRL, NULL); uether_pause(&sc->sc_ue, hz); axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_CLEAR, NULL); uether_pause(&sc->sc_ue, hz / 32); axe_cmd(sc, AXE_CMD_SW_RESET_REG, 0, AXE_SW_RESET_IPRL, NULL); uether_pause(&sc->sc_ue, hz / 32); } static void axe_ax88772a_init(struct axe_softc *sc) { struct usb_ether *ue; ue = &sc->sc_ue; /* Reload EEPROM. */ AXE_GPIO_WRITE(AXE_GPIO_RELOAD_EEPROM, hz / 32); axe_ax88772_phywake(sc); /* Stop MAC. */ axe_cmd(sc, AXE_CMD_RXCTL_WRITE, 0, 0, NULL); } static void axe_ax88772b_init(struct axe_softc *sc) { struct usb_ether *ue; uint16_t eeprom; uint8_t *eaddr; int i; ue = &sc->sc_ue; /* Reload EEPROM. */ AXE_GPIO_WRITE(AXE_GPIO_RELOAD_EEPROM, hz / 32); /* * Save PHY power saving configuration(high byte) and * clear EEPROM checksum value(low byte). */ axe_cmd(sc, AXE_CMD_SROM_READ, 0, AXE_EEPROM_772B_PHY_PWRCFG, &eeprom); sc->sc_pwrcfg = le16toh(eeprom) & 0xFF00; /* * Auto-loaded default station address from internal ROM is * 00:00:00:00:00:00 such that an explicit access to EEPROM * is required to get real station address. */ eaddr = ue->ue_eaddr; for (i = 0; i < ETHER_ADDR_LEN / 2; i++) { axe_cmd(sc, AXE_CMD_SROM_READ, 0, AXE_EEPROM_772B_NODE_ID + i, &eeprom); eeprom = le16toh(eeprom); *eaddr++ = (uint8_t)(eeprom & 0xFF); *eaddr++ = (uint8_t)((eeprom >> 8) & 0xFF); } /* Wakeup PHY. */ axe_ax88772_phywake(sc); /* Stop MAC. 
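 *
 * (Writing 0 to the RX control register leaves the receiver disabled
 * until axe_init() programs a proper RX mode.)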
*/ axe_cmd(sc, AXE_CMD_RXCTL_WRITE, 0, 0, NULL); } #undef AXE_GPIO_WRITE static void axe_reset(struct axe_softc *sc) { struct usb_config_descriptor *cd; usb_error_t err; cd = usbd_get_config_descriptor(sc->sc_ue.ue_udev); err = usbd_req_set_config(sc->sc_ue.ue_udev, &sc->sc_mtx, cd->bConfigurationValue); if (err) DPRINTF("reset failed (ignored)\n"); /* Wait a little while for the chip to get its brains in order. */ uether_pause(&sc->sc_ue, hz / 100); /* Reinitialize controller to achieve full reset. */ if (sc->sc_flags & AXE_FLAG_178) axe_ax88178_init(sc); else if (sc->sc_flags & AXE_FLAG_772) axe_ax88772_init(sc); else if (sc->sc_flags & AXE_FLAG_772A) axe_ax88772a_init(sc); else if (sc->sc_flags & AXE_FLAG_772B) axe_ax88772b_init(sc); } static void axe_attach_post(struct usb_ether *ue) { struct axe_softc *sc = uether_getsc(ue); /* * Load PHY indexes first. Needed by axe_xxx_init(). */ axe_cmd(sc, AXE_CMD_READ_PHYID, 0, 0, sc->sc_phyaddrs); if (bootverbose) device_printf(sc->sc_ue.ue_dev, "PHYADDR 0x%02x:0x%02x\n", sc->sc_phyaddrs[0], sc->sc_phyaddrs[1]); sc->sc_phyno = axe_get_phyno(sc, AXE_PHY_SEL_PRI); if (sc->sc_phyno == -1) sc->sc_phyno = axe_get_phyno(sc, AXE_PHY_SEL_SEC); if (sc->sc_phyno == -1) { device_printf(sc->sc_ue.ue_dev, "no valid PHY address found, assuming PHY address 0\n"); sc->sc_phyno = 0; } /* Initialize controller and get station address. */ if (sc->sc_flags & AXE_FLAG_178) { axe_ax88178_init(sc); axe_cmd(sc, AXE_178_CMD_READ_NODEID, 0, 0, ue->ue_eaddr); } else if (sc->sc_flags & AXE_FLAG_772) { axe_ax88772_init(sc); axe_cmd(sc, AXE_178_CMD_READ_NODEID, 0, 0, ue->ue_eaddr); } else if (sc->sc_flags & AXE_FLAG_772A) { axe_ax88772a_init(sc); axe_cmd(sc, AXE_178_CMD_READ_NODEID, 0, 0, ue->ue_eaddr); } else if (sc->sc_flags & AXE_FLAG_772B) { axe_ax88772b_init(sc); } else axe_cmd(sc, AXE_172_CMD_READ_NODEID, 0, 0, ue->ue_eaddr); /* * Fetch IPG values. */ if (sc->sc_flags & (AXE_FLAG_772A | AXE_FLAG_772B)) { /* Set IPG values. */ sc->sc_ipgs[0] = 0x15; sc->sc_ipgs[1] = 0x16; sc->sc_ipgs[2] = 0x1A; } else axe_cmd(sc, AXE_CMD_READ_IPG012, 0, 0, sc->sc_ipgs); } static int axe_attach_post_sub(struct usb_ether *ue) { struct axe_softc *sc; struct ifnet *ifp; u_int adv_pause; int error; sc = uether_getsc(ue); ifp = ue->ue_ifp; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_start = uether_start; ifp->if_ioctl = axe_ioctl; ifp->if_init = uether_init; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; IFQ_SET_READY(&ifp->if_snd); if (AXE_IS_178_FAMILY(sc)) ifp->if_capabilities |= IFCAP_VLAN_MTU; if (sc->sc_flags & AXE_FLAG_772B) { ifp->if_capabilities |= IFCAP_TXCSUM | IFCAP_RXCSUM; ifp->if_hwassist = AXE_CSUM_FEATURES; /* * Checksum offloading of AX88772B also works with VLAN * tagged frames but there is no way to take advantage * of the feature because vlan(4) assumes * IFCAP_VLAN_HWTAGGING is prerequisite condition to * support checksum offloading with VLAN. VLAN hardware * tagging support of AX88772B is very limited so it's * not possible to announce IFCAP_VLAN_HWTAGGING. */ } ifp->if_capenable = ifp->if_capabilities; if (sc->sc_flags & (AXE_FLAG_772A | AXE_FLAG_772B | AXE_FLAG_178)) adv_pause = MIIF_DOPAUSE; else adv_pause = 0; mtx_lock(&Giant); error = mii_attach(ue->ue_dev, &ue->ue_miibus, ifp, uether_ifmedia_upd, ue->ue_methods->ue_mii_sts, BMSR_DEFCAPMASK, sc->sc_phyno, MII_OFFSET_ANY, adv_pause); mtx_unlock(&Giant); return (error); } /* * Probe for a AX88172 chip. 
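 *
 * (Host mode and the expected configuration/interface index are
 * checked first; the actual vendor/product match is done by
 * usbd_lookup_id_by_uaa() against the axe_devs table above.)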
*/ static int axe_probe(device_t dev) { struct usb_attach_arg *uaa = device_get_ivars(dev); if (uaa->usb_mode != USB_MODE_HOST) return (ENXIO); if (uaa->info.bConfigIndex != AXE_CONFIG_IDX) return (ENXIO); if (uaa->info.bIfaceIndex != AXE_IFACE_IDX) return (ENXIO); return (usbd_lookup_id_by_uaa(axe_devs, sizeof(axe_devs), uaa)); } /* * Attach the interface. Allocate softc structures, do ifmedia * setup and ethernet/BPF attach. */ static int axe_attach(device_t dev) { struct usb_attach_arg *uaa = device_get_ivars(dev); struct axe_softc *sc = device_get_softc(dev); struct usb_ether *ue = &sc->sc_ue; uint8_t iface_index; int error; sc->sc_flags = USB_GET_DRIVER_INFO(uaa); device_set_usb_desc(dev); mtx_init(&sc->sc_mtx, device_get_nameunit(dev), NULL, MTX_DEF); iface_index = AXE_IFACE_IDX; error = usbd_transfer_setup(uaa->device, &iface_index, sc->sc_xfer, axe_config, AXE_N_TRANSFER, sc, &sc->sc_mtx); if (error) { device_printf(dev, "allocating USB transfers failed\n"); goto detach; } ue->ue_sc = sc; ue->ue_dev = dev; ue->ue_udev = uaa->device; ue->ue_mtx = &sc->sc_mtx; ue->ue_methods = &axe_ue_methods; error = uether_ifattach(ue); if (error) { device_printf(dev, "could not attach interface\n"); goto detach; } return (0); /* success */ detach: axe_detach(dev); return (ENXIO); /* failure */ } static int axe_detach(device_t dev) { struct axe_softc *sc = device_get_softc(dev); struct usb_ether *ue = &sc->sc_ue; usbd_transfer_unsetup(sc->sc_xfer, AXE_N_TRANSFER); uether_ifdetach(ue); mtx_destroy(&sc->sc_mtx); return (0); } #if (AXE_BULK_BUF_SIZE >= 0x10000) #error "Please update axe_bulk_read_callback()!" #endif static void axe_bulk_read_callback(struct usb_xfer *xfer, usb_error_t error) { struct axe_softc *sc = usbd_xfer_softc(xfer); struct usb_ether *ue = &sc->sc_ue; struct usb_page_cache *pc; int actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: pc = usbd_xfer_get_frame(xfer, 0); axe_rx_frame(ue, pc, actlen); /* FALLTHROUGH */ case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); uether_rxflush(ue); return; default: /* Error */ DPRINTF("bulk read error, %s\n", usbd_errstr(error)); if (error != USB_ERR_CANCELLED) { /* try to clear stall first */ usbd_xfer_set_stall(xfer); goto tr_setup; } return; } } static int axe_rx_frame(struct usb_ether *ue, struct usb_page_cache *pc, int actlen) { struct axe_softc *sc; struct axe_sframe_hdr hdr; struct axe_csum_hdr csum_hdr; int error, len, pos; sc = uether_getsc(ue); pos = 0; len = 0; error = 0; if ((sc->sc_flags & AXE_FLAG_STD_FRAME) != 0) { while (pos < actlen) { if ((int)(pos + sizeof(hdr)) > actlen) { /* too little data */ error = EINVAL; break; } usbd_copy_out(pc, pos, &hdr, sizeof(hdr)); if ((hdr.len ^ hdr.ilen) != sc->sc_lenmask) { /* we lost sync */ error = EINVAL; break; } pos += sizeof(hdr); len = le16toh(hdr.len); if (pos + len > actlen) { /* invalid length */ error = EINVAL; break; } axe_rxeof(ue, pc, pos, len, NULL); pos += len + (len % 2); } } else if ((sc->sc_flags & AXE_FLAG_CSUM_FRAME) != 0) { while (pos < actlen) { if ((int)(pos + sizeof(csum_hdr)) > actlen) { /* too little data */ error = EINVAL; break; } usbd_copy_out(pc, pos, &csum_hdr, sizeof(csum_hdr)); csum_hdr.len = le16toh(csum_hdr.len); csum_hdr.ilen = le16toh(csum_hdr.ilen); csum_hdr.cstatus = le16toh(csum_hdr.cstatus); if ((AXE_CSUM_RXBYTES(csum_hdr.len) ^ AXE_CSUM_RXBYTES(csum_hdr.ilen)) != sc->sc_lenmask) { /* we lost sync */ error = EINVAL; break; } 
/* * Get total transferred frame length including * checksum header. The length should be multiple * of 4. */ len = sizeof(csum_hdr) + AXE_CSUM_RXBYTES(csum_hdr.len); len = (len + 3) & ~3; if (pos + len > actlen) { /* invalid length */ error = EINVAL; break; } axe_rxeof(ue, pc, pos + sizeof(csum_hdr), AXE_CSUM_RXBYTES(csum_hdr.len), &csum_hdr); pos += len; } } else axe_rxeof(ue, pc, 0, actlen, NULL); if (error != 0) if_inc_counter(ue->ue_ifp, IFCOUNTER_IERRORS, 1); return (error); } static int axe_rxeof(struct usb_ether *ue, struct usb_page_cache *pc, unsigned int offset, unsigned int len, struct axe_csum_hdr *csum_hdr) { struct ifnet *ifp = ue->ue_ifp; struct mbuf *m; if (len < ETHER_HDR_LEN || len > MCLBYTES - ETHER_ALIGN) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (EINVAL); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (ENOMEM); } m->m_len = m->m_pkthdr.len = MCLBYTES; m_adj(m, ETHER_ALIGN); usbd_copy_out(pc, offset, mtod(m, uint8_t *), len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = m->m_len = len; if (csum_hdr != NULL && csum_hdr->cstatus & AXE_CSUM_HDR_L3_TYPE_IPV4) { if ((csum_hdr->cstatus & (AXE_CSUM_HDR_L4_CSUM_ERR | AXE_CSUM_HDR_L3_CSUM_ERR)) == 0) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; if ((csum_hdr->cstatus & AXE_CSUM_HDR_L4_TYPE_MASK) == AXE_CSUM_HDR_L4_TYPE_TCP || (csum_hdr->cstatus & AXE_CSUM_HDR_L4_TYPE_MASK) == AXE_CSUM_HDR_L4_TYPE_UDP) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } } } (void)mbufq_enqueue(&ue->ue_rxq, m); return (0); } #if ((AXE_BULK_BUF_SIZE >= 0x10000) || (AXE_BULK_BUF_SIZE < (MCLBYTES+4))) #error "Please update axe_bulk_write_callback()!" #endif static void axe_bulk_write_callback(struct usb_xfer *xfer, usb_error_t error) { struct axe_softc *sc = usbd_xfer_softc(xfer); struct axe_sframe_hdr hdr; struct ifnet *ifp = uether_getifp(&sc->sc_ue); struct usb_page_cache *pc; struct mbuf *m; int nframes, pos; switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: DPRINTFN(11, "transfer complete\n"); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; /* FALLTHROUGH */ case USB_ST_SETUP: tr_setup: if ((sc->sc_flags & AXE_FLAG_LINK) == 0 || (ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) { /* * Don't send anything if there is no link or * controller is busy. */ return; } for (nframes = 0; nframes < 16 && !IFQ_DRV_IS_EMPTY(&ifp->if_snd); nframes++) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; usbd_xfer_set_frame_offset(xfer, nframes * MCLBYTES, nframes); pos = 0; pc = usbd_xfer_get_frame(xfer, nframes); if (AXE_IS_178_FAMILY(sc)) { hdr.len = htole16(m->m_pkthdr.len); hdr.ilen = ~hdr.len; /* * If upper stack computed checksum, driver * should tell controller not to insert * computed checksum for checksum offloading * enabled controller. */ if (ifp->if_capabilities & IFCAP_TXCSUM) { if ((m->m_pkthdr.csum_flags & AXE_CSUM_FEATURES) != 0) hdr.len |= htole16( AXE_TX_CSUM_PSEUDO_HDR); else hdr.len |= htole16( AXE_TX_CSUM_DIS); } usbd_copy_in(pc, pos, &hdr, sizeof(hdr)); pos += sizeof(hdr); usbd_m_copy_in(pc, pos, m, 0, m->m_pkthdr.len); pos += m->m_pkthdr.len; if ((pos % 512) == 0) { hdr.len = 0; hdr.ilen = 0xffff; usbd_copy_in(pc, pos, &hdr, sizeof(hdr)); pos += sizeof(hdr); } } else { usbd_m_copy_in(pc, pos, m, 0, m->m_pkthdr.len); pos += m->m_pkthdr.len; } /* * XXX * Update TX packet counter here. 
This is not * correct way but it seems that there is no way * to know how many packets are sent at the end * of transfer because controller combines * multiple writes into single one if there is * room in TX buffer of controller. */ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* * if there's a BPF listener, bounce a copy * of this frame to him: */ BPF_MTAP(ifp, m); m_freem(m); /* Set frame length. */ usbd_xfer_set_frame_len(xfer, nframes, pos); } if (nframes != 0) { usbd_xfer_set_frames(xfer, nframes); usbd_transfer_submit(xfer); ifp->if_drv_flags |= IFF_DRV_OACTIVE; } return; /* NOTREACHED */ default: /* Error */ DPRINTFN(11, "transfer error, %s\n", usbd_errstr(error)); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (error != USB_ERR_CANCELLED) { /* try to clear stall first */ usbd_xfer_set_stall(xfer); goto tr_setup; } return; } } static void axe_tick(struct usb_ether *ue) { struct axe_softc *sc = uether_getsc(ue); struct mii_data *mii = GET_MII(sc); AXE_LOCK_ASSERT(sc, MA_OWNED); mii_tick(mii); if ((sc->sc_flags & AXE_FLAG_LINK) == 0) { axe_miibus_statchg(ue->ue_dev); if ((sc->sc_flags & AXE_FLAG_LINK) != 0) axe_start(ue); } } static void axe_start(struct usb_ether *ue) { struct axe_softc *sc = uether_getsc(ue); /* * start the USB transfers, if not already started: */ usbd_transfer_start(sc->sc_xfer[AXE_BULK_DT_RD]); usbd_transfer_start(sc->sc_xfer[AXE_BULK_DT_WR]); } static void axe_csum_cfg(struct usb_ether *ue) { struct axe_softc *sc; struct ifnet *ifp; uint16_t csum1, csum2; sc = uether_getsc(ue); AXE_LOCK_ASSERT(sc, MA_OWNED); if ((sc->sc_flags & AXE_FLAG_772B) != 0) { ifp = uether_getifp(ue); csum1 = 0; csum2 = 0; if ((ifp->if_capenable & IFCAP_TXCSUM) != 0) csum1 |= AXE_TXCSUM_IP | AXE_TXCSUM_TCP | AXE_TXCSUM_UDP; axe_cmd(sc, AXE_772B_CMD_WRITE_TXCSUM, csum2, csum1, NULL); csum1 = 0; csum2 = 0; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) csum1 |= AXE_RXCSUM_IP | AXE_RXCSUM_IPVE | AXE_RXCSUM_TCP | AXE_RXCSUM_UDP | AXE_RXCSUM_ICMP | AXE_RXCSUM_IGMP; axe_cmd(sc, AXE_772B_CMD_WRITE_RXCSUM, csum2, csum1, NULL); } } static void axe_init(struct usb_ether *ue) { struct axe_softc *sc = uether_getsc(ue); struct ifnet *ifp = uether_getifp(ue); uint16_t rxmode; AXE_LOCK_ASSERT(sc, MA_OWNED); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) return; /* Cancel pending I/O */ axe_stop(ue); axe_reset(sc); /* Set MAC address and transmitter IPG values. */ if (AXE_IS_178_FAMILY(sc)) { axe_cmd(sc, AXE_178_CMD_WRITE_NODEID, 0, 0, IF_LLADDR(ifp)); axe_cmd(sc, AXE_178_CMD_WRITE_IPG012, sc->sc_ipgs[2], (sc->sc_ipgs[1] << 8) | (sc->sc_ipgs[0]), NULL); } else { axe_cmd(sc, AXE_172_CMD_WRITE_NODEID, 0, 0, IF_LLADDR(ifp)); axe_cmd(sc, AXE_172_CMD_WRITE_IPG0, 0, sc->sc_ipgs[0], NULL); axe_cmd(sc, AXE_172_CMD_WRITE_IPG1, 0, sc->sc_ipgs[1], NULL); axe_cmd(sc, AXE_172_CMD_WRITE_IPG2, 0, sc->sc_ipgs[2], NULL); } if (AXE_IS_178_FAMILY(sc)) { sc->sc_flags &= ~(AXE_FLAG_STD_FRAME | AXE_FLAG_CSUM_FRAME); if ((sc->sc_flags & AXE_FLAG_772B) != 0 && (ifp->if_capenable & IFCAP_RXCSUM) != 0) { sc->sc_lenmask = AXE_CSUM_HDR_LEN_MASK; sc->sc_flags |= AXE_FLAG_CSUM_FRAME; } else { sc->sc_lenmask = AXE_HDR_LEN_MASK; sc->sc_flags |= AXE_FLAG_STD_FRAME; } } /* Configure TX/RX checksum offloading. */ axe_csum_cfg(ue); if (sc->sc_flags & AXE_FLAG_772B) { /* AX88772B uses different maximum frame burst configuration. 
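 *
 * The (threshold, byte_cnt) pairs come from ax88772b_mfb_table
 * above; AX88772B_MFB_16K presumably indexes the 16384-byte entry,
 * which matches the 16 KB bulk-in buffer this driver uses.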
*/ axe_cmd(sc, AXE_772B_CMD_RXCTL_WRITE_CFG, ax88772b_mfb_table[AX88772B_MFB_16K].threshold, ax88772b_mfb_table[AX88772B_MFB_16K].byte_cnt, NULL); } /* Enable receiver, set RX mode. */ rxmode = (AXE_RXCMD_MULTICAST | AXE_RXCMD_ENABLE); if (AXE_IS_178_FAMILY(sc)) { if (sc->sc_flags & AXE_FLAG_772B) { /* * Select RX header format type 1. Aligning IP * header on 4 byte boundary is not needed when * checksum offloading feature is not used * because we always copy the received frame in * RX handler. When RX checksum offloading is * active, aligning IP header is required to * reflect actual frame length including RX * header size. */ rxmode |= AXE_772B_RXCMD_HDR_TYPE_1; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) rxmode |= AXE_772B_RXCMD_IPHDR_ALIGN; } else { /* * Default Rx buffer size is too small to get * maximum performance. */ rxmode |= AXE_178_RXCMD_MFB_16384; } } else { rxmode |= AXE_172_RXCMD_UNICAST; } /* If we want promiscuous mode, set the allframes bit. */ if (ifp->if_flags & IFF_PROMISC) rxmode |= AXE_RXCMD_PROMISC; if (ifp->if_flags & IFF_BROADCAST) rxmode |= AXE_RXCMD_BROADCAST; axe_cmd(sc, AXE_CMD_RXCTL_WRITE, 0, rxmode, NULL); /* Load the multicast filter. */ axe_setmulti(ue); usbd_xfer_set_stall(sc->sc_xfer[AXE_BULK_DT_WR]); ifp->if_drv_flags |= IFF_DRV_RUNNING; /* Switch to selected media. */ axe_ifmedia_upd(ifp); } static void axe_setpromisc(struct usb_ether *ue) { struct axe_softc *sc = uether_getsc(ue); struct ifnet *ifp = uether_getifp(ue); uint16_t rxmode; axe_cmd(sc, AXE_CMD_RXCTL_READ, 0, 0, &rxmode); rxmode = le16toh(rxmode); if (ifp->if_flags & IFF_PROMISC) { rxmode |= AXE_RXCMD_PROMISC; } else { rxmode &= ~AXE_RXCMD_PROMISC; } axe_cmd(sc, AXE_CMD_RXCTL_WRITE, 0, rxmode, NULL); axe_setmulti(ue); } static void axe_stop(struct usb_ether *ue) { struct axe_softc *sc = uether_getsc(ue); struct ifnet *ifp = uether_getifp(ue); AXE_LOCK_ASSERT(sc, MA_OWNED); ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); sc->sc_flags &= ~AXE_FLAG_LINK; /* * stop all the transfers, if not already stopped: */ usbd_transfer_stop(sc->sc_xfer[AXE_BULK_DT_WR]); usbd_transfer_stop(sc->sc_xfer[AXE_BULK_DT_RD]); } static int axe_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct usb_ether *ue = ifp->if_softc; struct axe_softc *sc; struct ifreq *ifr; int error, mask, reinit; sc = uether_getsc(ue); ifr = (struct ifreq *)data; error = 0; reinit = 0; if (cmd == SIOCSIFCAP) { AXE_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if ((mask & IFCAP_TXCSUM) != 0 && (ifp->if_capabilities & IFCAP_TXCSUM) != 0) { ifp->if_capenable ^= IFCAP_TXCSUM; if ((ifp->if_capenable & IFCAP_TXCSUM) != 0) ifp->if_hwassist |= AXE_CSUM_FEATURES; else ifp->if_hwassist &= ~AXE_CSUM_FEATURES; reinit++; } if ((mask & IFCAP_RXCSUM) != 0 && (ifp->if_capabilities & IFCAP_RXCSUM) != 0) { ifp->if_capenable ^= IFCAP_RXCSUM; reinit++; } if (reinit > 0 && ifp->if_drv_flags & IFF_DRV_RUNNING) ifp->if_drv_flags &= ~IFF_DRV_RUNNING; else reinit = 0; AXE_UNLOCK(sc); if (reinit > 0) uether_init(ue); } else error = uether_ioctl(ifp, cmd, data); return (error); } Index: head/sys/netinet/netdump/netdump_client.c =================================================================== --- head/sys/netinet/netdump/netdump_client.c (revision 357663) +++ head/sys/netinet/netdump/netdump_client.c (revision 357664) @@ -1,817 +1,817 @@ /*- * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved. * Copyright (c) 2000 Darrell Anderson * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * netdump_client.c * FreeBSD subsystem supporting netdump network dumps. * A dedicated server must be running to accept client dumps. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NETDDEBUGV(f, ...) do { \ if (nd_debug > 1) \ printf(("%s: " f), __func__, ## __VA_ARGS__); \ } while (0) static int netdump_configure(struct diocskerneldump_arg *, struct thread *); static int netdump_dumper(void *priv __unused, void *virtual, vm_offset_t physical __unused, off_t offset, size_t length); static bool netdump_enabled(void); static int netdump_enabled_sysctl(SYSCTL_HANDLER_ARGS); static int netdump_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr, int flags __unused, struct thread *td); static int netdump_modevent(module_t mod, int type, void *priv); static int netdump_start(struct dumperinfo *di); static void netdump_unconfigure(void); /* Must be at least as big as the chunks dumpsys() gives us. */ static unsigned char nd_buf[MAXDUMPPGS * PAGE_SIZE]; static int dump_failed; /* Configuration parameters. */ static struct { char ndc_iface[IFNAMSIZ]; union kd_ip ndc_server; union kd_ip ndc_client; union kd_ip ndc_gateway; uint8_t ndc_af; /* Runtime State */ struct debugnet_pcb *nd_pcb; off_t nd_tx_off; size_t nd_buf_len; } nd_conf; #define nd_server nd_conf.ndc_server.in4 #define nd_client nd_conf.ndc_client.in4 #define nd_gateway nd_conf.ndc_gateway.in4 /* General dynamic settings. 
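 *
 * An sx(9) lock serializes access to this state: configuration
 * changes take the exclusive NETDUMP_WLOCK(), while queries such as
 * the "enabled" sysctl take the shared NETDUMP_RLOCK().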
*/ static struct sx nd_conf_lk; SX_SYSINIT(nd_conf, &nd_conf_lk, "netdump configuration lock"); #define NETDUMP_WLOCK() sx_xlock(&nd_conf_lk) #define NETDUMP_WUNLOCK() sx_xunlock(&nd_conf_lk) #define NETDUMP_RLOCK() sx_slock(&nd_conf_lk) #define NETDUMP_RUNLOCK() sx_sunlock(&nd_conf_lk) #define NETDUMP_ASSERT_WLOCKED() sx_assert(&nd_conf_lk, SA_XLOCKED) #define NETDUMP_ASSERT_LOCKED() sx_assert(&nd_conf_lk, SA_LOCKED) static struct ifnet *nd_ifp; static eventhandler_tag nd_detach_cookie; FEATURE(netdump, "Netdump client support"); static SYSCTL_NODE(_net, OID_AUTO, netdump, CTLFLAG_RD, NULL, "netdump parameters"); static int nd_debug; SYSCTL_INT(_net_netdump, OID_AUTO, debug, CTLFLAG_RWTUN, &nd_debug, 0, "Debug message verbosity"); SYSCTL_PROC(_net_netdump, OID_AUTO, enabled, CTLFLAG_RD | CTLTYPE_INT, NULL, 0, netdump_enabled_sysctl, "I", "netdump configuration status"); static char nd_path[MAXPATHLEN]; SYSCTL_STRING(_net_netdump, OID_AUTO, path, CTLFLAG_RW, nd_path, sizeof(nd_path), "Server path for output files"); /* * The following three variables were moved to debugnet(4), but these knobs * were retained as aliases. */ SYSCTL_INT(_net_netdump, OID_AUTO, polls, CTLFLAG_RWTUN, &debugnet_npolls, 0, "Number of times to poll before assuming packet loss (0.5ms per poll)"); SYSCTL_INT(_net_netdump, OID_AUTO, retries, CTLFLAG_RWTUN, &debugnet_nretries, 0, "Number of retransmit attempts before giving up"); SYSCTL_INT(_net_netdump, OID_AUTO, arp_retries, CTLFLAG_RWTUN, &debugnet_arp_nretries, 0, "Number of ARP attempts before giving up"); static bool nd_is_enabled; static bool netdump_enabled(void) { NETDUMP_ASSERT_LOCKED(); return (nd_is_enabled); } static void netdump_set_enabled(bool status) { NETDUMP_ASSERT_LOCKED(); nd_is_enabled = status; } static int netdump_enabled_sysctl(SYSCTL_HANDLER_ARGS) { int en, error; NETDUMP_RLOCK(); en = netdump_enabled(); NETDUMP_RUNLOCK(); error = SYSCTL_OUT(req, &en, sizeof(en)); if (error != 0 || req->newptr == NULL) return (error); return (EPERM); } /*- * Dumping specific primitives. */ /* * Flush any buffered vmcore data. */ static int netdump_flush_buf(void) { int error; error = 0; if (nd_conf.nd_buf_len != 0) { struct debugnet_proto_aux auxdata = { .dp_offset_start = nd_conf.nd_tx_off, }; error = debugnet_send(nd_conf.nd_pcb, DEBUGNET_DATA, nd_buf, nd_conf.nd_buf_len, &auxdata); if (error == 0) nd_conf.nd_buf_len = 0; } return (error); } /* * Callback from dumpsys() to dump a chunk of memory. * Copies it out to our static buffer then sends it across the network. * Detects the initial KDH and makes sure it is given a special packet type. * * Parameters: * priv Unused. Optional private pointer. * virtual Virtual address (where to read the data from) * physical Unused. Physical memory address. 
* offset Offset from start of core file * length Data length * * Return value: * 0 on success * errno on error */ static int netdump_dumper(void *priv __unused, void *virtual, vm_offset_t physical __unused, off_t offset, size_t length) { int error; NETDDEBUGV("netdump_dumper(NULL, %p, NULL, %ju, %zu)\n", virtual, (uintmax_t)offset, length); if (virtual == NULL) { error = netdump_flush_buf(); if (error != 0) dump_failed = 1; if (dump_failed != 0) printf("failed to dump the kernel core\n"); else if ( debugnet_sendempty(nd_conf.nd_pcb, DEBUGNET_FINISHED) != 0) printf("failed to close the transaction\n"); else printf("\nnetdump finished.\n"); debugnet_free(nd_conf.nd_pcb); nd_conf.nd_pcb = NULL; return (0); } if (length > sizeof(nd_buf)) return (ENOSPC); if (nd_conf.nd_buf_len + length > sizeof(nd_buf) || (nd_conf.nd_buf_len != 0 && nd_conf.nd_tx_off + nd_conf.nd_buf_len != offset)) { error = netdump_flush_buf(); if (error != 0) { dump_failed = 1; return (error); } nd_conf.nd_tx_off = offset; } memmove(nd_buf + nd_conf.nd_buf_len, virtual, length); nd_conf.nd_buf_len += length; return (0); } /* - * Perform any initalization needed prior to transmitting the kernel core. + * Perform any initialization needed prior to transmitting the kernel core. */ static int netdump_start(struct dumperinfo *di) { struct debugnet_conn_params dcp; struct debugnet_pcb *pcb; char buf[INET_ADDRSTRLEN]; int error; error = 0; /* Check if the dumping is allowed to continue. */ if (!netdump_enabled()) return (EINVAL); if (!KERNEL_PANICKED()) { printf( "netdump_start: netdump may only be used after a panic\n"); return (EINVAL); } memset(&dcp, 0, sizeof(dcp)); if (nd_server.s_addr == INADDR_ANY) { printf("netdump_start: can't netdump; no server IP given\n"); return (EINVAL); } /* We start dumping at offset 0. */ di->dumpoff = 0; dcp.dc_ifp = nd_ifp; dcp.dc_client = nd_client.s_addr; dcp.dc_server = nd_server.s_addr; dcp.dc_gateway = nd_gateway.s_addr; dcp.dc_herald_port = NETDUMP_PORT; dcp.dc_client_port = NETDUMP_ACKPORT; dcp.dc_herald_data = nd_path; dcp.dc_herald_datalen = (nd_path[0] == 0) ? 0 : strlen(nd_path) + 1; error = debugnet_connect(&dcp, &pcb); if (error != 0) { printf("failed to contact netdump server\n"); /* Squash debugnet to something the dumper code understands. */ return (EINVAL); } printf("netdumping to %s (%6D)\n", inet_ntoa_r(nd_server, buf), debugnet_get_gw_mac(pcb), ":"); nd_conf.nd_pcb = pcb; return (0); } static int netdump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh, void *key, uint32_t keysize) { int error; error = netdump_flush_buf(); if (error != 0) return (error); memcpy(nd_buf, kdh, sizeof(*kdh)); error = debugnet_send(nd_conf.nd_pcb, NETDUMP_KDH, nd_buf, sizeof(*kdh), NULL); if (error == 0 && keysize > 0) { if (keysize > sizeof(nd_buf)) return (EINVAL); memcpy(nd_buf, key, keysize); error = debugnet_send(nd_conf.nd_pcb, NETDUMP_EKCD_KEY, nd_buf, keysize, NULL); } return (error); } /*- * KLD specific code. 
*/ static struct cdevsw netdump_cdevsw = { .d_version = D_VERSION, .d_ioctl = netdump_ioctl, .d_name = "netdump", }; static struct cdev *netdump_cdev; static void netdump_unconfigure(void) { struct diocskerneldump_arg kda; NETDUMP_ASSERT_WLOCKED(); KASSERT(netdump_enabled(), ("%s: not enabled", __func__)); bzero(&kda, sizeof(kda)); kda.kda_index = KDA_REMOVE_DEV; (void)dumper_remove(nd_conf.ndc_iface, &kda); if (nd_ifp != NULL) if_rele(nd_ifp); nd_ifp = NULL; netdump_set_enabled(false); log(LOG_WARNING, "netdump: Lost configured interface %s\n", nd_conf.ndc_iface); bzero(&nd_conf, sizeof(nd_conf)); } static void netdump_ifdetach(void *arg __unused, struct ifnet *ifp) { NETDUMP_WLOCK(); if (ifp == nd_ifp) netdump_unconfigure(); NETDUMP_WUNLOCK(); } /* * td of NULL is a sentinel value that indicates a kernel caller (ddb(4) or * modload-based tunable parameters). */ static int netdump_configure(struct diocskerneldump_arg *conf, struct thread *td) { struct ifnet *ifp; NETDUMP_ASSERT_WLOCKED(); if (conf->kda_iface[0] != 0) { if (td != NULL && !IS_DEFAULT_VNET(TD_TO_VNET(td))) return (EINVAL); CURVNET_SET(vnet0); ifp = ifunit_ref(conf->kda_iface); CURVNET_RESTORE(); } else ifp = NULL; if (nd_ifp != NULL) if_rele(nd_ifp); nd_ifp = ifp; netdump_set_enabled(true); #define COPY_SIZED(elm) do { \ _Static_assert(sizeof(nd_conf.ndc_ ## elm) == \ sizeof(conf->kda_ ## elm), "elm " __XSTRING(elm) " mismatch"); \ memcpy(&nd_conf.ndc_ ## elm, &conf->kda_ ## elm, \ sizeof(nd_conf.ndc_ ## elm)); \ } while (0) COPY_SIZED(iface); COPY_SIZED(server); COPY_SIZED(client); COPY_SIZED(gateway); COPY_SIZED(af); #undef COPY_SIZED return (0); } /* * ioctl(2) handler for the netdump device. This is currently only used to * register netdump as a dump device. * * Parameters: * dev, Unused. * cmd, The ioctl to be handled. * addr, The parameter for the ioctl. * flags, Unused. * td, The thread invoking this ioctl. * * Returns: * 0 on success, and an errno value on failure. */ static int netdump_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr, int flags __unused, struct thread *td) { struct diocskerneldump_arg kda_copy, *conf; struct dumperinfo dumper; uint8_t *encryptedkey; int error; #ifdef COMPAT_FREEBSD11 u_int u; #endif #ifdef COMPAT_FREEBSD12 struct diocskerneldump_arg_freebsd12 *kda12; struct netdump_conf_freebsd12 *conf12; #endif conf = NULL; error = 0; NETDUMP_WLOCK(); switch (cmd) { #ifdef COMPAT_FREEBSD11 case DIOCSKERNELDUMP_FREEBSD11: gone_in(13, "11.x ABI compatibility"); u = *(u_int *)addr; if (u != 0) { error = ENXIO; break; } if (netdump_enabled()) netdump_unconfigure(); break; #endif #ifdef COMPAT_FREEBSD12 /* * Used by dumpon(8) in 12.x for clearing previous * configuration -- then NETDUMPSCONF_FREEBSD12 is used to * actually configure netdump. 
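 *
 * (Editor's illustrative sketch, not part of the compat ABI described
 * here: a modern consumer configures netdump through DIOCSKERNELDUMP on
 * /dev/netdump, using only fields visible in this file.  The interface
 * name and addresses below are hypothetical.)
 *
 *	struct diocskerneldump_arg kda;
 *	int fd;
 *
 *	memset(&kda, 0, sizeof(kda));
 *	strlcpy(kda.kda_iface, "em0", sizeof(kda.kda_iface));
 *	kda.kda_af = AF_INET;
 *	inet_aton("192.0.2.10", &kda.kda_server.in4);
 *	inet_aton("192.0.2.99", &kda.kda_client.in4);
 *	inet_aton("192.0.2.1", &kda.kda_gateway.in4);
 *	fd = open("/dev/netdump", O_RDONLY);
 *	if (fd < 0 || ioctl(fd, DIOCSKERNELDUMP, &kda) != 0)
 *		err(1, "DIOCSKERNELDUMP");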
*/ case DIOCSKERNELDUMP_FREEBSD12: gone_in(14, "12.x ABI compatibility"); kda12 = (void *)addr; if (kda12->kda12_enable) { error = ENXIO; break; } if (netdump_enabled()) netdump_unconfigure(); break; case NETDUMPGCONF_FREEBSD12: gone_in(14, "FreeBSD 12.x ABI compat"); conf12 = (void *)addr; if (!netdump_enabled()) { error = ENXIO; break; } if (nd_conf.ndc_af != AF_INET) { error = EOPNOTSUPP; break; } if (nd_ifp != NULL) strlcpy(conf12->ndc12_iface, nd_ifp->if_xname, sizeof(conf12->ndc12_iface)); memcpy(&conf12->ndc12_server, &nd_server, sizeof(conf12->ndc12_server)); memcpy(&conf12->ndc12_client, &nd_client, sizeof(conf12->ndc12_client)); memcpy(&conf12->ndc12_gateway, &nd_gateway, sizeof(conf12->ndc12_gateway)); break; #endif case DIOCGKERNELDUMP: conf = (void *)addr; /* * For now, index is ignored; netdump doesn't support multiple * configurations (yet). */ if (!netdump_enabled()) { error = ENXIO; conf = NULL; break; } if (nd_ifp != NULL) strlcpy(conf->kda_iface, nd_ifp->if_xname, sizeof(conf->kda_iface)); memcpy(&conf->kda_server, &nd_server, sizeof(nd_server)); memcpy(&conf->kda_client, &nd_client, sizeof(nd_client)); memcpy(&conf->kda_gateway, &nd_gateway, sizeof(nd_gateway)); conf->kda_af = nd_conf.ndc_af; conf = NULL; break; #ifdef COMPAT_FREEBSD12 case NETDUMPSCONF_FREEBSD12: gone_in(14, "FreeBSD 12.x ABI compat"); conf12 = (struct netdump_conf_freebsd12 *)addr; _Static_assert(offsetof(struct diocskerneldump_arg, kda_server) == offsetof(struct netdump_conf_freebsd12, ndc12_server), "simplifying assumption"); memset(&kda_copy, 0, sizeof(kda_copy)); memcpy(&kda_copy, conf12, offsetof(struct diocskerneldump_arg, kda_server)); /* 12.x ABI could only configure IPv4 (INET) netdump. */ kda_copy.kda_af = AF_INET; memcpy(&kda_copy.kda_server.in4, &conf12->ndc12_server, sizeof(struct in_addr)); memcpy(&kda_copy.kda_client.in4, &conf12->ndc12_client, sizeof(struct in_addr)); memcpy(&kda_copy.kda_gateway.in4, &conf12->ndc12_gateway, sizeof(struct in_addr)); kda_copy.kda_index = (conf12->ndc12_kda.kda12_enable ? 0 : KDA_REMOVE_ALL); conf = &kda_copy; explicit_bzero(conf12, sizeof(*conf12)); /* FALLTHROUGH */ #endif case DIOCSKERNELDUMP: encryptedkey = NULL; if (cmd == DIOCSKERNELDUMP) { conf = (void *)addr; memcpy(&kda_copy, conf, sizeof(kda_copy)); } /* Netdump only supports IP4 at this time. 
*/ if (conf->kda_af != AF_INET) { error = EPROTONOSUPPORT; break; } conf->kda_iface[sizeof(conf->kda_iface) - 1] = '\0'; if (conf->kda_index == KDA_REMOVE || conf->kda_index == KDA_REMOVE_DEV || conf->kda_index == KDA_REMOVE_ALL) { if (netdump_enabled()) netdump_unconfigure(); if (conf->kda_index == KDA_REMOVE_ALL) error = dumper_remove(NULL, conf); break; } error = netdump_configure(conf, td); if (error != 0) break; if (conf->kda_encryption != KERNELDUMP_ENC_NONE) { if (conf->kda_encryptedkeysize <= 0 || conf->kda_encryptedkeysize > KERNELDUMP_ENCKEY_MAX_SIZE) { error = EINVAL; break; } encryptedkey = malloc(conf->kda_encryptedkeysize, M_TEMP, M_WAITOK); error = copyin(conf->kda_encryptedkey, encryptedkey, conf->kda_encryptedkeysize); if (error != 0) { free(encryptedkey, M_TEMP); break; } conf->kda_encryptedkey = encryptedkey; } memset(&dumper, 0, sizeof(dumper)); dumper.dumper_start = netdump_start; dumper.dumper_hdr = netdump_write_headers; dumper.dumper = netdump_dumper; dumper.priv = NULL; dumper.blocksize = NETDUMP_DATASIZE; dumper.maxiosize = MAXDUMPPGS * PAGE_SIZE; dumper.mediaoffset = 0; dumper.mediasize = 0; error = dumper_insert(&dumper, conf->kda_iface, conf); if (encryptedkey != NULL) { explicit_bzero(encryptedkey, conf->kda_encryptedkeysize); free(encryptedkey, M_TEMP); } if (error != 0) netdump_unconfigure(); break; default: error = ENOTTY; break; } explicit_bzero(&kda_copy, sizeof(kda_copy)); if (conf != NULL) explicit_bzero(conf, sizeof(*conf)); NETDUMP_WUNLOCK(); return (error); } /* * Called upon system init or kld load. Initializes the netdump parameters to * sane defaults (locates the first available NIC and uses the first IPv4 IP on * that card as the client IP). Leaves the server IP unconfigured. * * Parameters: * mod, Unused. * what, The module event type. * priv, Unused. * * Returns: * int, An errno value if an error occurred, 0 otherwise. */ static int netdump_modevent(module_t mod __unused, int what, void *priv __unused) { struct diocskerneldump_arg conf; char *arg; int error; error = 0; switch (what) { case MOD_LOAD: error = make_dev_p(MAKEDEV_WAITOK, &netdump_cdev, &netdump_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "netdump"); if (error != 0) return (error); nd_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event, netdump_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if ((arg = kern_getenv("net.dump.iface")) != NULL) { strlcpy(conf.kda_iface, arg, sizeof(conf.kda_iface)); freeenv(arg); if ((arg = kern_getenv("net.dump.server")) != NULL) { inet_aton(arg, &conf.kda_server.in4); freeenv(arg); } if ((arg = kern_getenv("net.dump.client")) != NULL) { inet_aton(arg, &conf.kda_client.in4); freeenv(arg); } if ((arg = kern_getenv("net.dump.gateway")) != NULL) { inet_aton(arg, &conf.kda_gateway.in4); freeenv(arg); } conf.kda_af = AF_INET; /* Ignore errors; we print a message to the console. */ NETDUMP_WLOCK(); (void)netdump_configure(&conf, NULL); NETDUMP_WUNLOCK(); } break; case MOD_UNLOAD: NETDUMP_WLOCK(); if (netdump_enabled()) { printf("netdump: disabling dump device for unload\n"); netdump_unconfigure(); } NETDUMP_WUNLOCK(); destroy_dev(netdump_cdev); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nd_detach_cookie); break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t netdump_mod = { "netdump", netdump_modevent, NULL, }; MODULE_VERSION(netdump, 1); DECLARE_MODULE(netdump, netdump_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); #ifdef DDB /* * Usage: netdump -s <server> [-g <gateway> -i <interface>] * * Order is not significant.
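 *
 * For example (editor's illustration; the server, gateway and
 * interface values are hypothetical):
 *
 *	db> netdump -s 192.0.2.10 -g 192.0.2.1 -i em0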
* * Currently, this command does not support configuring encryption or * compression. */ DB_FUNC(netdump, db_netdump_cmd, db_cmd_table, CS_OWN, NULL) { static struct diocskerneldump_arg conf; static char blockbuf[NETDUMP_DATASIZE]; static union { struct dumperinfo di; /* For valid di_devname. */ char di_buf[sizeof(struct dumperinfo) + 1]; } u; struct debugnet_ddb_config params; int error; error = debugnet_parse_ddb_cmd("netdump", ¶ms); if (error != 0) { db_printf("Error configuring netdump: %d\n", error); return; } /* Translate to a netdump dumper config. */ memset(&conf, 0, sizeof(conf)); if (params.dd_ifp != NULL) strlcpy(conf.kda_iface, if_name(params.dd_ifp), sizeof(conf.kda_iface)); conf.kda_af = AF_INET; conf.kda_server.in4 = (struct in_addr) { params.dd_server }; if (params.dd_has_client) conf.kda_client.in4 = (struct in_addr) { params.dd_client }; else conf.kda_client.in4 = (struct in_addr) { INADDR_ANY }; if (params.dd_has_gateway) conf.kda_gateway.in4 = (struct in_addr) { params.dd_gateway }; else conf.kda_gateway.in4 = (struct in_addr) { INADDR_ANY }; /* Set the global netdump config to these options. */ error = netdump_configure(&conf, NULL); if (error != 0) { db_printf("Error enabling netdump: %d\n", error); return; } /* Fake the generic dump configuration list entry to avoid malloc. */ memset(&u.di_buf, 0, sizeof(u.di_buf)); u.di.dumper_start = netdump_start; u.di.dumper_hdr = netdump_write_headers; u.di.dumper = netdump_dumper; u.di.priv = NULL; u.di.blocksize = NETDUMP_DATASIZE; u.di.maxiosize = MAXDUMPPGS * PAGE_SIZE; u.di.mediaoffset = 0; u.di.mediasize = 0; u.di.blockbuf = blockbuf; dumper_ddb_insert(&u.di); error = doadump(false); dumper_ddb_remove(&u.di); if (error != 0) db_printf("Cannot dump: %d\n", error); } #endif /* DDB */ Index: head/sys/netinet/tcp_sack.c =================================================================== --- head/sys/netinet/tcp_sack.c (revision 357663) +++ head/sys/netinet/tcp_sack.c (revision 357664) @@ -1,886 +1,886 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 */ /*- * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #include VNET_DECLARE(struct uma_zone *, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); VNET_DEFINE(int, tcp_do_sack) = 1; #define V_tcp_do_sack VNET(tcp_do_sack) SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support"); VNET_DEFINE(int, tcp_sack_maxholes) = 128; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_maxholes), 0, "Maximum number of TCP SACK holes allowed per connection"); VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_globalmaxholes), 0, "Global maximum number of TCP SACK holes"); VNET_DEFINE(int, tcp_sack_globalholes) = 0; SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcp_sack_globalholes), 0, "Global number of TCP SACK holes currently allocated"); /* * This function will find overlaps with the currently stored sackblocks * and add any overlap as a dsack block upfront */ void tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) { struct sackblk head_blk,mid_blk,saved_blks[MAX_SACK_BLKS]; int i, j, n, identical; tcp_seq start, end; INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); if (SEQ_LT(rcv_end, tp->rcv_nxt) || ((rcv_end == tp->rcv_nxt) && (tp->rcv_numsacks > 0 ) && (tp->sackblks[0].end == tp->rcv_nxt))) { saved_blks[0].start = rcv_start; saved_blks[0].end = rcv_end; } else { saved_blks[0].start = saved_blks[0].end = 0; } head_blk.start = head_blk.end = 0; mid_blk.start = rcv_start; mid_blk.end = rcv_end; identical = 0; for (i = 0; i < tp->rcv_numsacks; i++) { start = tp->sackblks[i].start; end = tp->sackblks[i].end; if (SEQ_LT(rcv_end, start)) { /* pkt left to sack blk */ continue; } if (SEQ_GT(rcv_start, end)) { /* pkt right to sack blk */ continue; } if (SEQ_GT(tp->rcv_nxt, end)) { if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) && (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) || (head_blk.start == head_blk.end))) { head_blk.start = SEQ_MAX(rcv_start, start); head_blk.end = SEQ_MIN(rcv_end, end); } continue; } if (((head_blk.start == head_blk.end) || SEQ_LT(start, head_blk.start)) && (SEQ_GT(end, rcv_start) && SEQ_LEQ(start, rcv_end))) { head_blk.start = start; head_blk.end = end; } mid_blk.start = SEQ_MIN(mid_blk.start, start); mid_blk.end = SEQ_MAX(mid_blk.end, end); if ((mid_blk.start == start) && (mid_blk.end == end)) identical = 1; } if (SEQ_LT(head_blk.start, head_blk.end)) { /* store overlapping range */ saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start); saved_blks[0].end = SEQ_MIN(rcv_end, head_blk.end); } n = 1; /* * Second, if not ACKed, store the SACK block that * overlaps with the DSACK block unless it is identical */ if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) && 
!((mid_blk.start == saved_blks[0].start) && (mid_blk.end == saved_blks[0].end))) || identical == 1) { saved_blks[n].start = mid_blk.start; saved_blks[n++].end = mid_blk.end; } for (j = 0; (j < tp->rcv_numsacks) && (n < MAX_SACK_BLKS); j++) { if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) || SEQ_GT(tp->sackblks[j].start, mid_blk.end)) && (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt)))) saved_blks[n++] = tp->sackblks[j]; } j = 0; for (i = 0; i < n; i++) { - /* we can end up with a stale inital entry */ + /* we can end up with a stale initial entry */ if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) { tp->sackblks[j++] = saved_blks[i]; } } tp->rcv_numsacks = j; } /* * This function is called upon receipt of new valid data (while not in * header prediction mode), and it updates the ordered list of sacks. */ void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) { /* * First reported block MUST be the most recent one. Subsequent * blocks SHOULD be in the order in which they arrived at the * receiver. These two conditions make the implementation fully * compliant with RFC 2018. */ struct sackblk head_blk, saved_blks[MAX_SACK_BLKS]; int num_head, num_saved, i; INP_WLOCK_ASSERT(tp->t_inpcb); /* Check arguments. */ KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end")); if ((rcv_start == rcv_end) && (tp->rcv_numsacks >= 1) && (rcv_end == tp->sackblks[0].end)) { /* retaining DSACK block below rcv_nxt (todrop) */ head_blk = tp->sackblks[0]; } else { /* SACK block for the received segment. */ head_blk.start = rcv_start; head_blk.end = rcv_end; } /* * Merge updated SACK blocks into head_blk, and save unchanged SACK * blocks into saved_blks[]. num_saved will have the number of the * saved SACK blocks. */ num_saved = 0; for (i = 0; i < tp->rcv_numsacks; i++) { tcp_seq start = tp->sackblks[i].start; tcp_seq end = tp->sackblks[i].end; if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { /* * Discard this SACK block. */ } else if (SEQ_LEQ(head_blk.start, end) && SEQ_GEQ(head_blk.end, start)) { /* * Merge this SACK block into head_blk. This SACK * block itself will be discarded. */ /* * |-| * |---| merge * * |-| * |---| merge * * |-----| * |-| DSACK smaller * * |-| * |-----| DSACK smaller */ if (head_blk.start == end) head_blk.start = start; else if (head_blk.end == start) head_blk.end = end; else { if (SEQ_LT(head_blk.start, start)) { tcp_seq temp = start; start = head_blk.start; head_blk.start = temp; } if (SEQ_GT(head_blk.end, end)) { tcp_seq temp = end; end = head_blk.end; head_blk.end = temp; } if ((head_blk.start != start) || (head_blk.end != end)) { if ((num_saved >= 1) && SEQ_GEQ(saved_blks[num_saved-1].start, start) && SEQ_LEQ(saved_blks[num_saved-1].end, end)) num_saved--; saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } } } else { /* * This block supersedes the prior block */ if ((num_saved >= 1) && SEQ_GEQ(saved_blks[num_saved-1].start, start) && SEQ_LEQ(saved_blks[num_saved-1].end, end)) num_saved--; /* * Save this SACK block. */ saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } } /* * Update SACK list in tp->sackblks[]. */ num_head = 0; if (SEQ_LT(rcv_start, rcv_end)) { /* * The received data segment is an out-of-order segment. Put * head_blk at the top of SACK list. */ tp->sackblks[0] = head_blk; num_head = 1; /* * If the number of saved SACK blocks exceeds its limit, * discard the last SACK block.
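 *
 * Editor's worked example of the overall update: with rcv_nxt = 100,
 * out-of-order segments arriving as [200,300), [400,500) and then
 * [300,400) yield, most-recent-first as RFC 2018 requires:
 *
 *	after [200,300):  sackblks = { [200,300) }
 *	after [400,500):  sackblks = { [400,500), [200,300) }
 *	after [300,400):  head_blk absorbs both neighbors via the
 *			  merge rules above; sackblks = { [200,500) }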
*/ if (num_saved >= MAX_SACK_BLKS) num_saved--; } if ((rcv_start == rcv_end) && (rcv_start == tp->sackblks[0].end)) { num_head = 1; } if (num_saved > 0) { /* * Copy the saved SACK blocks back. */ bcopy(saved_blks, &tp->sackblks[num_head], sizeof(struct sackblk) * num_saved); } /* Save the number of SACK blocks. */ tp->rcv_numsacks = num_head + num_saved; } void tcp_clean_dsack_blocks(struct tcpcb *tp) { struct sackblk saved_blks[MAX_SACK_BLKS]; int num_saved, i; INP_WLOCK_ASSERT(tp->t_inpcb); /* * Clean up any DSACK blocks that * are in our queue of sack blocks. * */ num_saved = 0; for (i = 0; i < tp->rcv_numsacks; i++) { tcp_seq start = tp->sackblks[i].start; tcp_seq end = tp->sackblks[i].end; if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { /* * Discard this D-SACK block. */ continue; } /* * Save this SACK block. */ saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } if (num_saved > 0) { /* * Copy the saved SACK blocks back. */ bcopy(saved_blks, &tp->sackblks[0], sizeof(struct sackblk) * num_saved); } tp->rcv_numsacks = num_saved; } /* * Delete all receiver-side SACK information. */ void tcp_clean_sackreport(struct tcpcb *tp) { int i; INP_WLOCK_ASSERT(tp->t_inpcb); tp->rcv_numsacks = 0; for (i = 0; i < MAX_SACK_BLKS; i++) tp->sackblks[i].start = tp->sackblks[i].end=0; } /* * Allocate struct sackhole. */ static struct sackhole * tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) { struct sackhole *hole; if (tp->snd_numholes >= V_tcp_sack_maxholes || V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) { TCPSTAT_INC(tcps_sack_sboverflow); return NULL; } hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT); if (hole == NULL) return NULL; hole->start = start; hole->end = end; hole->rxmit = start; tp->snd_numholes++; atomic_add_int(&V_tcp_sack_globalholes, 1); return hole; } /* * Free struct sackhole. */ static void tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) { uma_zfree(V_sack_hole_zone, hole); tp->snd_numholes--; atomic_subtract_int(&V_tcp_sack_globalholes, 1); KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); } /* * Insert new SACK hole into scoreboard. */ static struct sackhole * tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end, struct sackhole *after) { struct sackhole *hole; /* Allocate a new SACK hole. */ hole = tcp_sackhole_alloc(tp, start, end); if (hole == NULL) return NULL; /* Insert the new SACK hole into scoreboard. */ if (after != NULL) TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink); else TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink); /* Update SACK hint. */ if (tp->sackhint.nexthole == NULL) tp->sackhint.nexthole = hole; return hole; } /* * Remove SACK hole from scoreboard. */ static void tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) { /* Update SACK hint. */ if (tp->sackhint.nexthole == hole) tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink); /* Remove this SACK hole. */ TAILQ_REMOVE(&tp->snd_holes, hole, scblink); /* Free this SACK hole. */ tcp_sackhole_free(tp, hole); } /* * Process cumulative ACK and the TCP SACK option to update the scoreboard. * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of * the sequence space). * Returns 1 if incoming ACK has previously unknown SACK information, * 0 otherwise. Note: We treat (snd_una, th_ack) as a sack block so any changes * to that (i.e. 
left edge moving) would also be considered a change in SACK * information which is slightly different than rfc6675. */ int tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) { struct sackhole *cur, *temp; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; int i, j, num_sack_blks, sack_changed; INP_WLOCK_ASSERT(tp->t_inpcb); num_sack_blks = 0; sack_changed = 0; /* * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, * treat [SND.UNA, SEG.ACK) as if it is a SACK block. */ if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { sack_blocks[num_sack_blks].start = tp->snd_una; sack_blocks[num_sack_blks++].end = th_ack; } /* * Append received valid SACK blocks to sack_blocks[], but only if we * received new blocks from the other side. */ if (to->to_flags & TOF_SACK) { tp->sackhint.sacked_bytes = 0; /* reset */ for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, tp->snd_una) && SEQ_GT(sack.start, th_ack) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, tp->snd_una) && SEQ_LEQ(sack.end, tp->snd_max)) { sack_blocks[num_sack_blks++] = sack; tp->sackhint.sacked_bytes += (sack.end-sack.start); } } } /* * Return if SND.UNA is not advanced and no valid SACK block is * received. */ if (num_sack_blks == 0) return (sack_changed); /* * Sort the SACK blocks so we can update the scoreboard with just one * pass. The overhead of sorting up to 4+1 elements is less than * making up to 4+1 passes over the scoreboard. */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } if (TAILQ_EMPTY(&tp->snd_holes)) /* * Empty scoreboard. Need to initialize snd_fack (it may be * uninitialized or have a bogus value). Scoreboard holes * (from the sack blocks received) are created later below * (in the logic that adds holes to the tail of the * scoreboard). */ tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); /* * In the while-loop below, incoming SACK blocks (sack_blocks[]) and * SACK holes (snd_holes) are traversed from their tails with just * one pass in order to reduce the number of compares especially when * the bandwidth-delay product is large. * * Note: Typically, in the first RTT of SACK recovery, the highest * three or four SACK blocks with the same ack number are received. * In the second RTT, if retransmitted data segments are not lost, * the highest three or four SACK blocks with ack number advancing * are received. */ sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ tp->sackhint.last_sack_ack = sblkp->end; if (SEQ_LT(tp->snd_fack, sblkp->start)) { /* * The highest SACK block is beyond fack. Append new SACK * hole at the tail. If the second or later highest SACK * blocks are also beyond the current fack, they will be * inserted by way of hole splitting in the while-loop below. */ temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); if (temp != NULL) { tp->snd_fack = sblkp->end; /* Go to the previous sack block. */ sblkp--; sack_changed = 1; } else { /* * We failed to add a new hole based on the current * sack block. Skip over all the sack blocks that * fall completely to the right of snd_fack and * proceed to trim the scoreboard based on the * remaining sack blocks. 
This also trims the * scoreboard for th_ack (which is sack_blocks[0]). */ while (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->start)) sblkp--; if (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->end)) tp->snd_fack = sblkp->end; } } else if (SEQ_LT(tp->snd_fack, sblkp->end)) { /* fack is advanced. */ tp->snd_fack = sblkp->end; sack_changed = 1; } cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ /* * Since the incoming sack blocks are sorted, we can process them * making one sweep of the scoreboard. */ while (sblkp >= sack_blocks && cur != NULL) { if (SEQ_GEQ(sblkp->start, cur->end)) { /* * SACKs data beyond the current hole. Go to the * previous sack block. */ sblkp--; continue; } if (SEQ_LEQ(sblkp->end, cur->start)) { /* * SACKs data before the current hole. Go to the * previous hole. */ cur = TAILQ_PREV(cur, sackhole_head, scblink); continue; } tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); sack_changed = 1; if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Acks entire hole, so delete hole. */ temp = cur; cur = TAILQ_PREV(cur, sackhole_head, scblink); tcp_sackhole_remove(tp, temp); /* * The sack block may ack all or part of the * next hole too, so continue onto the next * hole. */ continue; } else { /* Move start of hole forward. */ cur->start = sblkp->end; cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); } } else { /* Data acks at least the end of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Move end of hole backward. */ cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } else { /* * ACKs some data in middle of a hole; need * to split current hole */ temp = tcp_sackhole_insert(tp, sblkp->end, cur->end, cur); if (temp != NULL) { if (SEQ_GT(cur->rxmit, temp->rxmit)) { temp->rxmit = cur->rxmit; tp->sackhint.sack_bytes_rexmit += (temp->rxmit - temp->start); } cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } } } tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); /* * Testing sblkp->start against cur->start tells us whether * we're done with the sack block or the sack hole. * Accordingly, we advance one or the other. */ if (SEQ_LEQ(sblkp->start, cur->start)) cur = TAILQ_PREV(cur, sackhole_head, scblink); else sblkp--; } return (sack_changed); } /* * Free all SACK holes to clear the scoreboard. */ void tcp_free_sackholes(struct tcpcb *tp) { struct sackhole *q; INP_WLOCK_ASSERT(tp->t_inpcb); while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) tcp_sackhole_remove(tp, q); tp->sackhint.sack_bytes_rexmit = 0; KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0")); KASSERT(tp->sackhint.nexthole == NULL, ("tp->sackhint.nexthole == NULL")); } /* * Partial ack handling within a sack recovery episode. Keeping this very * simple for now. When a partial ack is received, force snd_cwnd to a value * that will allow the sender to transmit no more than 2 segments. If * necessary, a better scheme can be adopted at a later point, but for now, * the goal is to prevent the sender from bursting a large amount of data in * the midst of sack recovery. */ void tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) { int num_segs = 1; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* Send one or 2 segments based on how much new data was acked. 
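 *
 * Editor's worked example: with t_maxseg = 1460, sack_bytes_rexmit =
 * 2920, (snd_nxt - sack_newdata) = 5840, and an ACK covering 3000 new
 * bytes, BYTES_THIS_ACK / t_maxseg = 2, so num_segs = 2 and
 *
 *	snd_cwnd = 2920 + 5840 + 2 * 1460 = 11680 bytes,
 *
 * clamped to snd_ssthresh if that is smaller.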
*/ if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2) num_segs = 2; tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); } #if 0 /* * Debug version of tcp_sack_output() that walks the scoreboard. Used for * now to sanity check the hint. */ static struct sackhole * tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *p; INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = 0; TAILQ_FOREACH(p, &tp->snd_holes, scblink) { if (SEQ_LT(p->rxmit, p->end)) { if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ continue; } *sack_bytes_rexmt += (p->rxmit - p->start); break; } *sack_bytes_rexmt += (p->rxmit - p->start); } return (p); } #endif /* * Returns the next hole to retransmit and the number of retransmitted bytes * from the scoreboard. We store both the next hole and the number of * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK * reception). This avoids scoreboard traversals completely. * * The loop here will traverse *at most* one link. Here's the argument. For * the loop to traverse more than 1 link before finding the next hole to * retransmit, we would need to have at least 1 node following the current * hint with (rxmit == end). But, for all holes following the current hint, * (start == rxmit), since we have not yet retransmitted from them. * Therefore, in order to traverse more 1 link in the loop below, we need to * have at least one node following the current hint with (start == rxmit == * end). But that can't happen, (start == end) means that all the data in * that hole has been sacked, in which case, the hole would have been removed * from the scoreboard. */ struct sackhole * tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *hole = NULL; INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit; hole = tp->sackhint.nexthole; if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) goto out; while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) { if (SEQ_LT(hole->rxmit, hole->end)) { tp->sackhint.nexthole = hole; break; } } out: return (hole); } /* * After a timeout, the SACK list may be rebuilt. This SACK information * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ void tcp_sack_adjust(struct tcpcb *tp) { struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); INP_WLOCK_ASSERT(tp->t_inpcb); if (cur == NULL) return; /* No holes */ if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) return; /* We're already beyond any SACKed blocks */ /*- * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and snd_fack */ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) return; if (SEQ_GEQ(tp->snd_nxt, p->start)) cur = p; else { tp->snd_nxt = p->start; return; } } if (SEQ_LT(tp->snd_nxt, cur->end)) return; tp->snd_nxt = tp->snd_fack; } Index: head/sys/netinet/tcp_stacks/bbr.c =================================================================== --- head/sys/netinet/tcp_stacks/bbr.c (revision 357663) +++ head/sys/netinet/tcp_stacks/bbr.c (revision 357664) @@ -1,15199 +1,15199 @@ /*- * Copyright (c) 2016-9 * Netflix Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /** * Author: Randall Stewart * This work is based on the ACM Queue paper * BBR - Congestion Based Congestion Control * and also numerous discussions with Neal, Yuchung and Van. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" #include "opt_kern_tls.h" #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #ifdef KERN_TLS #include #endif #include #include #ifdef STATS #include #include #include /* Must come after qmath.h and tree.h */ #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef INET6 #include #endif #include #include #include #include #include #if defined(IPSEC) || defined(IPSEC_SUPPORT) #include #include #endif /* IPSEC */ #include #include #include #ifdef MAC #include #endif #include "sack_filter.h" #include "tcp_bbr.h" #include "rack_bbr_common.h" uma_zone_t bbr_zone; uma_zone_t bbr_pcb_zone; struct sysctl_ctx_list bbr_sysctl_ctx; struct sysctl_oid *bbr_sysctl_root; #define TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \ (tv) = (value); \ if ((u_long)(tv) < (u_long)(tvmin)) \ (tv) = (tvmin); \ if ((u_long)(tv) > (u_long)(tvmax)) \ (tv) = (tvmax); \ } while(0) /*#define BBR_INVARIANT 1*/ /* * initial window */ static uint32_t bbr_def_init_win = 10; static int32_t bbr_persist_min = 250000; /* 250ms */ static int32_t bbr_persist_max = 1000000; /* 1 Second */ static int32_t bbr_cwnd_may_shrink = 0; static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP; static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT; static int32_t bbr_hardware_pacing_limit = 8000; static int32_t bbr_quanta = 3; /* How much extra quanta do we get? 
*/ static int32_t bbr_no_retran = 0; static int32_t bbr_error_base_paceout = 10000; /* usec to pace */ static int32_t bbr_max_net_error_cnt = 10; /* Should the following be dynamic too -- loss wise */ static int32_t bbr_rtt_gain_thresh = 0; /* Measurement controls */ static int32_t bbr_use_google_algo = 1; static int32_t bbr_ts_limiting = 1; static int32_t bbr_ts_can_raise = 0; static int32_t bbr_do_red = 600; static int32_t bbr_red_scale = 20000; static int32_t bbr_red_mul = 1; static int32_t bbr_red_div = 2; static int32_t bbr_red_growth_restrict = 1; static int32_t bbr_target_is_bbunit = 0; static int32_t bbr_drop_limit = 0; /* * How much gain do we need to see to * stay in startup? */ static int32_t bbr_marks_rxt_sack_passed = 0; static int32_t bbr_start_exit = 25; static int32_t bbr_low_start_exit = 25; /* When we are in reduced gain */ static int32_t bbr_startup_loss_thresh = 2000; /* 20.00% loss */ static int32_t bbr_hptsi_max_mul = 1; /* These two mul/div assure a min pacing */ static int32_t bbr_hptsi_max_div = 2; /* time, 0 means turned off. We need this * if we ever go back to where the pacer * has priority over timers. */ static int32_t bbr_policer_call_from_rack_to = 0; static int32_t bbr_policer_detection_enabled = 1; static int32_t bbr_min_measurements_req = 1; /* We need at least 2 * measurements before we are * "good"; note that 2 == 1. * This is because we use a > * comparison. This means if * min_measure was 0, it takes * num-measures > min(0) and * you get 1 measurement and * you are good. Set to 1, you * have to have two * measurements (this is done * to prevent it from being ok * to have no measurements). */ static int32_t bbr_no_pacing_until = 4; static int32_t bbr_min_usec_delta = 20000; /* 20,000 usecs */ static int32_t bbr_min_peer_delta = 20; /* 20 units */ static int32_t bbr_delta_percent = 150; /* 15.0 % */ static int32_t bbr_target_cwnd_mult_limit = 8; /* * bbr_cwnd_min_val is the number of * segments we hold to in the RTT probe * state, typically 4. */ static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS; static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS; static int32_t bbr_gain_to_target = 1; static int32_t bbr_gain_gets_extra_too = 1; /* * bbr_high_gain is the 2/ln(2) value we need * to double the sending rate in startup. This * is used for both cwnd and hptsi gains.
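 *
 * Editor's note (worked example; assumes BBR_UNIT is the fixed-point
 * unit of 256 from tcp_bbr.h): 2/ln(2) ~= 2.885, so the value below is
 * 256 * 2885 / 1000 + 1 = 739, i.e. a gain of 739/256 ~= 2.89.
 * Applying such a gain to a bandwidth estimate is then just
 *
 *	paced_bw = (bw * (uint64_t)gain) / BBR_UNIT;
 */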
static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1; static int32_t bbr_use_lower_gain_in_startup = 1; /* thresholds for reduction on drain in sub-states/drain */ static int32_t bbr_drain_rtt = BBR_SRTT; static int32_t bbr_drain_floor = 88; static int32_t google_allow_early_out = 1; static int32_t google_consider_lost = 1; static int32_t bbr_drain_drop_mul = 4; static int32_t bbr_drain_drop_div = 5; static int32_t bbr_rand_ot = 50; static int32_t bbr_can_force_probertt = 0; static int32_t bbr_can_adjust_probertt = 1; static int32_t bbr_probertt_sets_rtt = 0; static int32_t bbr_can_use_ts_for_rtt = 1; static int32_t bbr_is_ratio = 0; static int32_t bbr_sub_drain_app_limit = 1; static int32_t bbr_prtt_slam_cwnd = 1; static int32_t bbr_sub_drain_slam_cwnd = 1; static int32_t bbr_slam_cwnd_in_main_drain = 1; static int32_t bbr_filter_len_sec = 6; /* How long does the rttProp filter * hold */ static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4); /* * bbr_drain_gain is the reverse of the high_gain * designed to drain back out the standing queue * that is formed in startup by causing a larger * hptsi gain and thus draining the packets * in flight. */ static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885; static int32_t bbr_rttprobe_gain = 192; /* * The cwnd_gain is the default cwnd gain applied when * calculating a target cwnd. Note that the cwnd is * a secondary factor in the way BBR works (see the * paper and think about it, it will take some time). * Basically the hptsi_gain spreads the packets out * so you never get more than BDP to the peer even * if the cwnd is high. In our implementation that * means in non-recovery/retransmission scenarios * cwnd will never be reached by the flight-size. */ static int32_t bbr_cwnd_gain = BBR_UNIT * 2; static int32_t bbr_tlp_type_to_use = BBR_SRTT; static int32_t bbr_delack_time = 100000; /* 100ms in useconds */ static int32_t bbr_sack_not_required = 0; /* set to one to allow non-sack to use bbr */ static int32_t bbr_initial_bw_bps = 62500; /* 500kbps in bytes ps */ static int32_t bbr_ignore_data_after_close = 1; static int16_t bbr_hptsi_gain[] = { (BBR_UNIT *5 / 4), (BBR_UNIT * 3 / 4), BBR_UNIT, BBR_UNIT, BBR_UNIT, BBR_UNIT, BBR_UNIT, BBR_UNIT }; int32_t bbr_use_rack_resend_cheat = 1; int32_t bbr_sends_full_iwnd = 1; #define BBR_HPTSI_GAIN_MAX 8 /* * The BBR module incorporates a number of * TCP ideas that have been put out into the IETF * over the last few years: * - Yuchung Cheng's RACK TCP (for which it's named) that * will stop us using the number of dup acks and instead * use time as the gauge of when we retransmit. * - Reorder Detection of RFC4737 and the Tail-Loss probe draft * of Dukkipati et al. * - Van Jacobson et al.'s BBR. * * RACK depends on SACK, so if an endpoint arrives that * cannot do SACK the state machine below will shuttle the * connection back to using the "default" TCP stack that is * in FreeBSD. * * To implement BBR and RACK the original TCP stack was first decomposed * into a functional state machine with individual states * for each of the possible TCP connection states. The do_segment * function's role in life is to mandate that the connection supports SACK * initially and then assure that the RACK state matches the connection * state before calling the state's do_segment function. Data processing * of inbound segments also now happens in the hpts_do_segment in general * with only one exception. This is so we can keep the connection on * a single CPU.
* * Each state is simplified due to the fact that the original do_segment * has been decomposed and we *know* what state we are in (no * switches on the state) and all tests for SACK are gone. This * greatly simplifies what each state does. * * TCP output is also overwritten with a new version since it * must maintain the new rack scoreboard and has had hptsi * integrated as a requirement. Still to do is to eliminate the * use of the callout_() system and use the hpts for all * timers as well. */ static uint32_t bbr_rtt_probe_time = 200000; /* 200ms in micro seconds */ static uint32_t bbr_rtt_probe_cwndtarg = 4; /* How many mss's outstanding */ static const int32_t bbr_min_req_free = 2; /* The min we must have on the * free list */ static int32_t bbr_tlp_thresh = 1; static int32_t bbr_reorder_thresh = 2; static int32_t bbr_reorder_fade = 60000000; /* 0 - never fade, def * 60,000,000 - 60 seconds */ static int32_t bbr_pkt_delay = 1000; static int32_t bbr_min_to = 1000; /* Number of usec's minimum timeout */ static int32_t bbr_incr_timers = 1; static int32_t bbr_tlp_min = 10000; /* 10ms in usecs */ static int32_t bbr_delayed_ack_time = 200000; /* 200ms in usecs */ static int32_t bbr_exit_startup_at_loss = 1; /* * bbr_lt_bw_ratio is 1/8th * bbr_lt_bw_diff is < 4 Kbit/sec */ static uint64_t bbr_lt_bw_diff = 4000 / 8; /* In bytes per second */ static uint64_t bbr_lt_bw_ratio = 8; /* For 1/8th */ static uint32_t bbr_lt_bw_max_rtts = 48; /* How many rtt's do we use * the lt_bw for */ static uint32_t bbr_lt_intvl_min_rtts = 4; /* Min num of RTT's to measure * lt_bw */ static int32_t bbr_lt_intvl_fp = 0; /* False positive epoch diff */ static int32_t bbr_lt_loss_thresh = 196; /* Lost vs delivered % */ static int32_t bbr_lt_fd_thresh = 100; /* false detection % */ static int32_t bbr_verbose_logging = 0; /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up * being a total of 122.850 seconds before a * connection is killed. */ static int32_t bbr_rto_min_ms = 30; /* 30ms same as main freebsd */ static int32_t bbr_rto_max_sec = 4; /* 4 seconds */ /****************************************************/ /* DEFAULT TSO SIZING (cpu performance impacting) */ /****************************************************/ /* What amount is our formula using to get TSO size */ static int32_t bbr_hptsi_per_second = 1000; /* * For hptsi under bbr_cross_over connections, a delay * target of 7ms (in usec) combined with a seg_max of 2 * gets us close to identical google behavior in * TSO size selection (possibly more 1MSS sends). */ static int32_t bbr_hptsi_segments_delay_tar = 7000; /* Does pacing delay include overheads in its time calculations? */ static int32_t bbr_include_enet_oh = 0; static int32_t bbr_include_ip_oh = 1; static int32_t bbr_include_tcp_oh = 1; static int32_t bbr_google_discount = 10; /* Do we use (nf mode) pkt-epoch to drive us or rttProp? */ static int32_t bbr_state_is_pkt_epoch = 0; static int32_t bbr_state_drain_2_tar = 1; /* What is the max the 0 - bbr_cross_over MBPS TSO target * can reach using our delay target. Note that this * value becomes the floor for the cross over * algorithm.
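 *
 * Editor's back-of-envelope illustration of the delay-target sizing
 * described above: at 10 Mbit/s, 7000 usec of pacing delay covers
 * 10e6 / 8 * 0.007 = 8750 bytes, i.e. about six 1460-byte segments;
 * at 1 Mbit/s the same delay covers only 875 bytes, and the
 * bbr_hptsi_segments_floor / bbr_hptsi_bytes_min limits below apply.
 */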
static int32_t bbr_hptsi_segments_max = 2; static int32_t bbr_hptsi_segments_floor = 1; static int32_t bbr_hptsi_utter_max = 0; /* What is the min the 0 - bbr_cross-over MBPS TSO target can be */ static int32_t bbr_hptsi_bytes_min = 1460; static int32_t bbr_all_get_min = 0; /* Cross over point from algo-a to algo-b */ static uint32_t bbr_cross_over = TWENTY_THREE_MBPS; /* Do we deal with our restart state? */ static int32_t bbr_uses_idle_restart = 0; static int32_t bbr_idle_restart_threshold = 100000; /* 100ms in useconds */ /* Do we allow hardware pacing? */ static int32_t bbr_allow_hdwr_pacing = 0; static int32_t bbr_hdwr_pace_adjust = 2; /* multiplier when we calc the tso size */ static int32_t bbr_hdwr_pace_floor = 1; static int32_t bbr_hdwr_pacing_delay_cnt = 10; /****************************************************/ static int32_t bbr_resends_use_tso = 0; static int32_t bbr_tlp_max_resend = 2; static int32_t bbr_sack_block_limit = 128; #define BBR_MAX_STAT 19 counter_u64_t bbr_state_time[BBR_MAX_STAT]; counter_u64_t bbr_state_lost[BBR_MAX_STAT]; counter_u64_t bbr_state_resend[BBR_MAX_STAT]; counter_u64_t bbr_stat_arry[BBR_STAT_SIZE]; counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE]; counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t bbr_flows_whdwr_pacing; counter_u64_t bbr_flows_nohdwr_pacing; counter_u64_t bbr_nohdwr_pacing_enobuf; counter_u64_t bbr_hdwr_pacing_enobuf; static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr); /* * Static definitions we need for forward declarations. */ static uint32_t bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw); static uint32_t bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain); static void bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win); static void bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses); static void bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line, int dolog); static uint32_t bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain); static void bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses); static uint32_t bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm); static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp); static uint32_t bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t srtt, uint32_t cts); static void bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line); static void bbr_set_state_target(struct tcp_bbr *bbr, int line); static void bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line); static void bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line); static void tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts); static void bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts); static void bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt, uint32_t line, uint8_t is_start, uint16_t set); static struct bbr_sendmap * bbr_find_lowest_rsm(struct tcp_bbr *bbr); static __inline uint32_t bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type); static void bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which); static void bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt, uint32_t thresh, uint32_t to); static void
bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag); static void bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay); static void bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line); static void bbr_stop_all_timers(struct tcpcb *tp); static void bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts); static void bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts); static void bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts); static void bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod); static inline uint8_t bbr_state_val(struct tcp_bbr *bbr) { return(bbr->rc_bbr_substate); } static inline uint32_t get_min_cwnd(struct tcp_bbr *bbr) { int mss; mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); if (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED) return (bbr_cwnd_min_val_hs * mss); else return (bbr_cwnd_min_val * mss); } static uint32_t bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr) { uint64_t srtt, var; uint64_t ret_val; bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; if (tp->t_srtt == 0) { srtt = (uint64_t)BBR_INITIAL_RTO; var = 0; } else { srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); var = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT); } TCPT_RANGESET_NOSLOP(ret_val, ((srtt + var) * tcp_backoff[tp->t_rxtshift]), bbr_persist_min, bbr_persist_max); return ((uint32_t)ret_val); } static uint32_t bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* * Start the FR timer, we do this based on getting the first one in * the rc_tmap. Note that if its NULL we must stop the timer. in all * events we need to stop the running timer (if its running) before * starting the new one. 
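 *
 * Editor's worked example for the persist path above
 * (bbr_get_persists_timer_val): with a smoothed RTT of 40000 usec, a
 * variance of 8000 usec and t_rxtshift = 3 (tcp_backoff[3] = 8), the
 * raw value is (40000 + 8000) * 8 = 384000 usec, which already lies
 * inside [bbr_persist_min, bbr_persist_max] = [250000, 1000000] usec;
 * a 5 ms path at the same shift (48000 usec raw) would be clamped up
 * to 250000 usec by TCPT_RANGESET_NOSLOP().
 */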
*/ uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; int32_t idx; int32_t is_tlp_timer = 0; struct bbr_sendmap *rsm; if (bbr->rc_all_timers_stopped) { /* All timers have been stopped none are to run */ return (0); } if (bbr->rc_in_persist) { /* We can't start any timer in persists */ return (bbr_get_persists_timer_val(tp, bbr)); } rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if ((rsm == NULL) || ((tp->t_flags & TF_SACK_PERMIT) == 0) || (tp->t_state < TCPS_ESTABLISHED)) { /* Nothing on the send map */ activate_rxt: if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { uint64_t tov; time_since_sent = 0; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if (rsm) { idx = rsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time)) tstmp_touse = rsm->r_tim_lastsent[idx]; else tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time; if (TSTMP_GT(tstmp_touse, cts)) time_since_sent = cts - tstmp_touse; } bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; if (tp->t_srtt == 0) tov = BBR_INITIAL_RTO; else tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) + ((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT); if (tp->t_rxtshift) tov *= tcp_backoff[tp->t_rxtshift]; if (tov > time_since_sent) tov -= time_since_sent; else tov = bbr->r_ctl.rc_min_to; TCPT_RANGESET_NOSLOP(to, tov, (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC), (bbr->rc_max_rto_sec * USECS_IN_SECOND)); bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to); return (to); } return (0); } if (rsm->r_flags & BBR_ACKED) { rsm = bbr_find_lowest_rsm(bbr); if (rsm == NULL) { /* No lowest? */ goto activate_rxt; } } /* Convert from ms to usecs */ if (rsm->r_flags & BBR_SACK_PASSED) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & BBR_HAS_FIN)) { /* * We don't start a bbr rack timer if all we have is * a FIN outstanding. */ goto activate_rxt; } srtt = bbr_get_rtt(bbr, BBR_RTT_RACK); thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm); idx = rsm->r_rtr_cnt - 1; exp = rsm->r_tim_lastsent[idx] + thresh; if (SEQ_GEQ(exp, cts)) { to = exp - cts; if (to < bbr->r_ctl.rc_min_to) { to = bbr->r_ctl.rc_min_to; } } else { to = bbr->r_ctl.rc_min_to; } } else { /* Ok we need to do a TLP not RACK */ if (bbr->rc_tlp_in_progress != 0) { /* * The previous send was a TLP. */ goto activate_rxt; } rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with. */ goto activate_rxt; } if (rsm->r_flags & BBR_HAS_FIN) { /* If its a FIN we don't do TLP */ rsm = NULL; goto activate_rxt; } time_since_sent = 0; idx = rsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time)) tstmp_touse = rsm->r_tim_lastsent[idx]; else tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time; if (TSTMP_GT(tstmp_touse, cts)) time_since_sent = cts - tstmp_touse; is_tlp_timer = 1; srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use); thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts); if (thresh > time_since_sent) to = thresh - time_since_sent; else to = bbr->r_ctl.rc_min_to; if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { /* * If the TLP time works out to larger than the max * RTO lets not do TLP.. just RTO. */ goto activate_rxt; } if ((bbr->rc_tlp_rtx_out == 1) && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) { /* * Second retransmit of the same TLP * lets not. 
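 *
 * (Editor's worked illustration of the TLP arming logic above: with
 * thresh = 30000 usec from bbr_calc_thresh_tlp() and the tail segment
 * sent 12000 usec ago, the TLP fires in to = 18000 usec; had `to'
 * worked out beyond rc_max_rto_sec (default bbr_rto_max_sec = 4)
 * times USECS_IN_SECOND, the code falls back to the RTO timer
 * instead.)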
*/ bbr->rc_tlp_rtx_out = 0; goto activate_rxt; } if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) { /* * The tail is no longer the last one I did a probe * on. */ bbr->r_ctl.rc_tlp_seg_send_cnt = 0; bbr->r_ctl.rc_last_tlp_seq = rsm->r_start; } } if (is_tlp_timer == 0) { BBR_STAT_INC(bbr_to_arm_rack); bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; } else { bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to); if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) { /* * We have exceeded how many times we can retransmit the * current TLP; switch to the RTO timer. */ goto activate_rxt; } else { BBR_STAT_INC(bbr_to_arm_tlp); bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; } } return (to); } static inline int32_t bbr_minseg(struct tcp_bbr *bbr) { return (bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options); } static void bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len) { struct inpcb *inp; struct hpts_diag diag; uint32_t delayed_ack = 0; uint32_t left = 0; uint32_t hpts_timeout; uint8_t stopped; int32_t delay_calc = 0; uint32_t prev_delay = 0; inp = tp->t_inpcb; if (inp->inp_in_hpts) { /* A previous call is already set up */ return; } if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { return; } stopped = bbr->rc_tmr_stopped; if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) { left = bbr->r_ctl.rc_timer_exp - cts; } bbr->r_ctl.rc_hpts_flags = 0; bbr->r_ctl.rc_timer_exp = 0; prev_delay = bbr->r_ctl.rc_last_delay_val; if (bbr->r_ctl.rc_last_delay_val && (slot == 0)) { /* * If a previous pacer delay was in place we * are not coming from the output side (where * we calculate a delay); more likely a timer. */ slot = bbr->r_ctl.rc_last_delay_val; if (TSTMP_GT(cts, bbr->rc_pacer_started)) { /* Compensate for time passed */ delay_calc = cts - bbr->rc_pacer_started; if (delay_calc <= slot) slot -= delay_calc; } } /* Do we have 'early' time to make up for by pushing out the pacing time? */ if (bbr->r_agg_early_set) { bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2); slot += bbr->r_ctl.rc_agg_early; bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; } /* Are we running a total debt that needs to be compensated for? */ if (bbr->r_ctl.rc_hptsi_agg_delay) { if (slot > bbr->r_ctl.rc_hptsi_agg_delay) { /* We nuke the delay */ slot -= bbr->r_ctl.rc_hptsi_agg_delay; bbr->r_ctl.rc_hptsi_agg_delay = 0; } else { /* We nuke some of the delay, put in a minimal 100usecs */ bbr->r_ctl.rc_hptsi_agg_delay -= slot; bbr->r_ctl.rc_last_delay_val = slot = 100; } } bbr->r_ctl.rc_last_delay_val = slot; hpts_timeout = bbr_timer_start(tp, bbr, cts); if (tp->t_flags & TF_DELACK) { if (bbr->rc_in_persist == 0) { delayed_ack = bbr_delack_time; } else { /* * We are in persists and have * gotten a new data element. */ if (hpts_timeout > bbr_delack_time) { /* * Let's make the persists timer (which acks) * be the smaller of hpts_timeout and bbr_delack_time. */ hpts_timeout = bbr_delack_time; } } } if (delayed_ack && ((hpts_timeout == 0) || (delayed_ack < hpts_timeout))) { /* We need a Delayed ack timer */ bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; hpts_timeout = delayed_ack; } if (slot) { /* Mark that we have a pacing timer up */ BBR_STAT_INC(bbr_paced_segments); bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; } /* * If no timers are going to run and we will fall off the hptsi * wheel, we resort to a keep-alive timer if it's configured.
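* (We pick TP_KEEPIDLE once the connection is established and * TP_KEEPINIT before that, matching the default stack's choice.)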
*/ if ((hpts_timeout == 0) && (slot == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* * Ok we have no timer (persists, rack, tlp, rxt or * del-ack), we don't have segments being paced. So * all that is left is the keepalive timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); } else { hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); } bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; } } if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { /* * RACK, TLP, persists and RXT timers all are restartable * based on actions input .. i.e. we received a packet (ack * or sack) and that changes things (rw, or snd_una etc). * Thus we can restart them with a new value. For * keep-alive and delayed_ack we keep track of what was left * and restart the timer with a smaller value. */ if (left < hpts_timeout) hpts_timeout = left; } if (bbr->r_ctl.rc_incr_tmrs && slot && (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * If configured to do so, and the timer is either * the TLP or RXT timer, we need to increase the timeout * by the pacing time. Consider the bottleneck at my * machine as an example, we are sending something * to start a TLP on. The last packet won't be emitted * fully until the pacing time (the bottleneck will hold * the data in place). Once the packet is emitted that * is when we want to start waiting for the TLP. This * is most evident with hardware pacing (where the nic * is holding the packet(s) before emitting). But it * can also show up in the network so we do it for all * cases. Technically we would take off one packet from * this extra delay but this is easier and being more * conservative is probably better. */ hpts_timeout += slot; } if (hpts_timeout) { /* * Hack alert, for now we can't time-out over 2147 seconds (a * bit more than 35min) */ if (hpts_timeout > 0x7ffffffe) hpts_timeout = 0x7ffffffe; bbr->r_ctl.rc_timer_exp = cts + hpts_timeout; } else bbr->r_ctl.rc_timer_exp = 0; if ((slot) && (bbr->rc_use_google || bbr->output_error_seen || (slot <= hpts_timeout))) { /* * Tell LRO that it can queue packets while * we pace. */ bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && (bbr->rc_cwnd_limited == 0)) { /* * If we are not cwnd limited and we * are running a rack timer we put on * the do-not-disturb even for sack. */ inp->inp_flags2 |= INP_DONT_SACK_QUEUE; } else inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; bbr->rc_pacer_started = cts; (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), __LINE__, &diag); bbr->rc_timer_first = 0; bbr->bbr_timer_src = frm; bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1); bbr_log_hpts_diag(bbr, cts, &diag); } else if (hpts_timeout) { (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), __LINE__, &diag); /* * We add the flag here as well if the slot is set, * since hpts will call in to clear the queue first before * calling the output routine (which does our timers). * We don't want to set the flag if it's just a timer, * else the arrival of data (that causes us * to send more) might get delayed. Imagine being * on a keep-alive timer and a request comes in for * more data. */ if (slot) bbr->rc_pacer_started = cts; if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && (bbr->rc_cwnd_limited == 0)) { /* * For a rack timer, don't wake us even * if a sack arrives as long as we are * not cwnd limited.
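* (INP_MBUF_QUEUE_READY and INP_DONT_SACK_QUEUE below let LRO keep * queuing packets, even when a sack arrives, until the pacer runs.)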
*/ bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; inp->inp_flags2 |= INP_DONT_SACK_QUEUE; } else { /* All other timers wake us up */ bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; } bbr->bbr_timer_src = frm; bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0); bbr_log_hpts_diag(bbr, cts, &diag); bbr->rc_timer_first = 1; } bbr->rc_tmr_stopped = 0; bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay); } static void bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb) { /* * We received an ack, and then did not call send or were bounced * out because the hpts was running. Now a timer is up as well; is it * the right timer? */ struct inpcb *inp; struct bbr_sendmap *rsm; uint32_t hpts_timeout; int tmr_up; tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) return; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && (tmr_up == PACE_TMR_RXT)) { /* Should be an RXT */ return; } inp = bbr->rc_inp; if (rsm == NULL) { /* Nothing outstanding? */ if (tp->t_flags & TF_DELACK) { if (tmr_up == PACE_TMR_DELACK) /* * We are supposed to have delayed ack up * and we do */ return; } else if (sbavail(&inp->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { /* * If we hit enobufs then we would expect the * possibility of nothing outstanding and the RXT up * (and the hptsi timer). */ return; } else if (((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && (tmr_up == PACE_TMR_KEEP) && (tp->snd_max == tp->snd_una)) { /* We should have keep alive up and we do */ return; } } if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & BBR_HAS_FIN)) { /* needs to be a RXT */ if (tmr_up == PACE_TMR_RXT) return; else goto wrong_timer; } else if (tmr_up == PACE_TMR_RACK) return; else goto wrong_timer; } else if (rsm && (tmr_up == PACE_TMR_RACK)) { /* Rack timer has priority if we have data out */ return; } else if (SEQ_GT(tp->snd_max, tp->snd_una) && ((tmr_up == PACE_TMR_TLP) || (tmr_up == PACE_TMR_RXT))) { /* * Either a TLP or RXT is fine if no sack-passed is in place * and data is outstanding. */ return; } else if (tmr_up == PACE_TMR_DELACK) { /* * If the delayed ack was going to go off before the * rtx/tlp/rack timer were going to expire, then that would * be the timer in control. Note we don't check the time * here trusting the code is correct. */ return; } if (SEQ_GT(tp->snd_max, tp->snd_una) && ((tmr_up == PACE_TMR_RXT) || (tmr_up == PACE_TMR_TLP) || (tmr_up == PACE_TMR_RACK))) { /* * We have outstanding data and * we *do* have a RACK, TLP or RXT * timer running. We won't restart * anything here since that's probably ok; we * will get called with some timer here shortly. */ return; } /* * Ok the timer originally started is not what we want now. We will * force the hpts to be stopped if any, and restart with the slot * set to what was in the saved slot. */ wrong_timer: if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) { if (inp->inp_in_hpts) tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); bbr_timer_cancel(bbr, __LINE__, cts); bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val, 0); } else { /* * Output is hptsi so we just need to switch the type of * timer. We don't bother with keep-alive, since when we * jump through the output, it will start the keep-alive if * nothing is sent.
* * We only need a delayed-ack added and/or the hpts_timeout. */ hpts_timeout = bbr_timer_start(tp, bbr, cts); if (tp->t_flags & TF_DELACK) { if (hpts_timeout == 0) { hpts_timeout = bbr_delack_time; bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; } else if (hpts_timeout > bbr_delack_time) { hpts_timeout = bbr_delack_time; bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; } } if (hpts_timeout) { if (hpts_timeout > 0x7ffffffe) hpts_timeout = 0x7ffffffe; bbr->r_ctl.rc_timer_exp = cts + hpts_timeout; } } } int32_t bbr_clear_lost = 0; /* * Considers the two time values now (cts) and earlier. * If cts is smaller than earlier, we could have * had a sequence wrap (our counter wraps every * 70 min or so) or it could be just clock skew * getting us two different time values. Clock skew * will show up within 10ms or so. So in such * a case (where cts is behind earlier time by * less than 10ms) we return 0. Otherwise we * return the true difference between them. */ static inline uint32_t bbr_calc_time(uint32_t cts, uint32_t earlier_time) { /* * Given two timestamps, the current time stamp cts, and some other * time-stamp taken in theory earlier return the difference. The * trick here is that sometimes locking will get the other timestamp * after the cts. If this occurs we need to return 0. */ if (TSTMP_GEQ(cts, earlier_time)) return (cts - earlier_time); /* * cts is behind earlier_time; if it's less than 10ms consider it 0. * If it's more than 10ms difference then we had a time wrap. Else * it's just the normal locking foo. I wonder if we should not go to * 64bit TS and get rid of this issue. */ if (TSTMP_GEQ((cts + 10000), earlier_time)) return (0); /* * Ok the time must have wrapped. So we need to return a large * amount of time, which the normal subtraction should do. */ return (cts - earlier_time); } static int sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS) { uint32_t stat; int32_t error; error = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t)); if (error || req->newptr == NULL) return (error); error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); if (error) return (error); if (stat == 1) { #ifdef BBR_INVARIANTS printf("Clearing BBR lost counters\n"); #endif COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT); COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT); COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT); } else if (stat == 2) { #ifdef BBR_INVARIANTS printf("Clearing BBR option counters\n"); #endif COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE); } else if (stat == 3) { #ifdef BBR_INVARIANTS printf("Clearing BBR stats counters\n"); #endif COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE); } else if (stat == 4) { #ifdef BBR_INVARIANTS printf("Clearing BBR out-size counters\n"); #endif COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE); } bbr_clear_lost = 0; return (0); } static void bbr_init_sysctls(void) { struct sysctl_oid *bbr_probertt; struct sysctl_oid *bbr_hptsi; struct sysctl_oid *bbr_measure; struct sysctl_oid *bbr_cwnd; struct sysctl_oid *bbr_timeout; struct sysctl_oid *bbr_states; struct sysctl_oid *bbr_startup; struct sysctl_oid *bbr_policer; /* Probe rtt controls */ bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "probertt", CTLFLAG_RW, 0, ""); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "gain", CTLFLAG_RW, &bbr_rttprobe_gain, 192, "What is the filter gain drop in probe_rtt (0=disable)?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "cwnd", CTLFLAG_RW, &bbr_rtt_probe_cwndtarg, 4, "How many mss's are outstanding
during probe-rtt"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "int", CTLFLAG_RW, &bbr_rtt_probe_limit, 4000000, "If RTT has not shrunk in this many micro-seconds enter probe-rtt"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "mintime", CTLFLAG_RW, &bbr_rtt_probe_time, 200000, "How many microseconds in probe-rtt"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "filter_len_sec", CTLFLAG_RW, &bbr_filter_len_sec, 6, "How long in seconds does the rttProp filter run?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "drain_rtt", CTLFLAG_RW, &bbr_drain_rtt, BBR_SRTT, "What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "can_force", CTLFLAG_RW, &bbr_can_force_probertt, 0, "If we keep setting new low rtt's but delay going in probe-rtt can we force it in?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "enter_sets_force", CTLFLAG_RW, &bbr_probertt_sets_rtt, 0, "In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "can_adjust", CTLFLAG_RW, &bbr_can_adjust_probertt, 1, "Can we dynamically adjust the probe-rtt limits and times?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "is_ratio", CTLFLAG_RW, &bbr_is_ratio, 0, "Is the limit to filter a ratio?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "use_cwnd", CTLFLAG_RW, &bbr_prtt_slam_cwnd, 0, "Should we set/recover cwnd?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "can_use_ts", CTLFLAG_RW, &bbr_can_use_ts_for_rtt, 1, "Can we use the ms timestamp if available for retransmitted rtt calculations?"); /* Pacing controls */ bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "pacing", CTLFLAG_RW, 0, ""); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing", CTLFLAG_RW, &bbr_allow_hdwr_pacing, 1, "Do we allow hardware pacing?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing_limit", CTLFLAG_RW, &bbr_hardware_pacing_limit, 4000, "Do we have a limited number of connections for pacing on Chelsio (0=no limit)?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing_adj", CTLFLAG_RW, &bbr_hdwr_pace_adjust, 2, "Multiplier to the calculated TSO size?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing_floor", CTLFLAG_RW, &bbr_hdwr_pace_floor, 1, "Do we invoke the hardware pacing floor?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW, &bbr_hdwr_pacing_delay_cnt, 10, "How many packets must be sent after hdwr pacing is enabled"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "bw_cross", CTLFLAG_RW, &bbr_cross_over, 3000000, "What is the point where we cross over to a Linux-like TSO size set"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "seg_deltarg", CTLFLAG_RW, &bbr_hptsi_segments_delay_tar, 7000, "What is the worst case delay target for hptsi < 48Mbps connections"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "enet_oh", CTLFLAG_RW, &bbr_include_enet_oh, 0, "Do we include the ethernet overhead in calculating pacing delay?");
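/* * These knobs surface under the stack's sysctl root; assuming the * functions-set registers under its default "bbr" name, that is * net.inet.tcp.bbr, e.g.: * # sysctl net.inet.tcp.bbr.pacing.hw_pacing=0 * (the exact path depends on the name the stack was registered with). */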
SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "ip_oh", CTLFLAG_RW, &bbr_include_ip_oh, 1, "Do we include the IP overhead in calculating pacing delay?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "tcp_oh", CTLFLAG_RW, &bbr_include_tcp_oh, 0, "Do we include the TCP overhead in calculating pacing delay?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "google_discount", CTLFLAG_RW, &bbr_google_discount, 10, "What is the default google discount percentage wise for pacing (11 = 1.1%%)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "all_get_min", CTLFLAG_RW, &bbr_all_get_min, 0, "If you are less than a MSS do you just get the min?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "tso_min", CTLFLAG_RW, &bbr_hptsi_bytes_min, 1460, "For 0 -> 24Mbps what is the floor number of segments for TSO"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "seg_tso_max", CTLFLAG_RW, &bbr_hptsi_segments_max, 6, "For 0 -> 24Mbps what is the top number of segments for TSO"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "seg_floor", CTLFLAG_RW, &bbr_hptsi_segments_floor, 1, "Minimum TSO size we will fall to in segments"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "utter_max", CTLFLAG_RW, &bbr_hptsi_utter_max, 0, "The absolute maximum that any pacing (outside of hardware) can be"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "seg_divisor", CTLFLAG_RW, &bbr_hptsi_per_second, 100, "What is the divisor in our hptsi TSO calculation (24Mbps < X < 512Mbps)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "srtt_mul", CTLFLAG_RW, &bbr_hptsi_max_mul, 1, "The multiplier for pace len max"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "srtt_div", CTLFLAG_RW, &bbr_hptsi_max_div, 2, "The divisor for pace len max"); /* Measurement controls */ bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "measure", CTLFLAG_RW, 0, "Measurement controls"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "min_i_bw", CTLFLAG_RW, &bbr_initial_bw_bps, 62500, "Minimum initial b/w in bytes per second"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "no_sack_needed", CTLFLAG_RW, &bbr_sack_not_required, 0, "Do we allow bbr to run on connections not supporting SACK?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "use_google", CTLFLAG_RW, &bbr_use_google_algo, 0, "Use as close to google V1.0 as possible?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_limiting", CTLFLAG_RW, &bbr_ts_limiting, 1, "Do we attempt to use the peer's timestamp to limit b/w calculations?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_can_raise", CTLFLAG_RW, &bbr_ts_can_raise, 0, "Can we raise the b/w via timestamp b/w calculation?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_delta", CTLFLAG_RW, &bbr_min_usec_delta, 20000, "How long in usec between ts of our sends in ts validation code?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_peer_delta", CTLFLAG_RW, &bbr_min_peer_delta, 20, "What min numerical value should be between the peer deltas?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_delta_percent", CTLFLAG_RW, &bbr_delta_percent, 150,
"What percentage (150 = 15.0) do we allow variance for?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "min_measure_good_bw", CTLFLAG_RW, &bbr_min_measurements_req, 1, "What is the minimum measurment count we need before we switch to our b/w estimate"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "min_measure_before_pace", CTLFLAG_RW, &bbr_no_pacing_until, 4, "How many pkt-epoch's (0 is off) do we need before pacing is on?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "quanta", CTLFLAG_RW, &bbr_quanta, 2, "Extra quanta to add when calculating the target (ID section 4.2.3.2)."); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "noretran", CTLFLAG_RW, &bbr_no_retran, 0, "Should google mode not use retransmission measurements for the b/w estimation?"); /* State controls */ bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "states", CTLFLAG_RW, 0, "State controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "idle_restart", CTLFLAG_RW, &bbr_uses_idle_restart, 0, "Do we use a new special idle_restart state to ramp back up quickly?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "idle_restart_threshold", CTLFLAG_RW, &bbr_idle_restart_threshold, 100000, "How long must we be idle before we restart??"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "use_pkt_epoch", CTLFLAG_RW, &bbr_state_is_pkt_epoch, 0, "Do we use a pkt-epoch for substate if 0 rttProp?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "startup_rtt_gain", CTLFLAG_RW, &bbr_rtt_gain_thresh, 0, "What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "drain_floor", CTLFLAG_RW, &bbr_drain_floor, 88, "What is the lowest we can drain (pg) too?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "drain_2_target", CTLFLAG_RW, &bbr_state_drain_2_tar, 1, "Do we drain to target in drain substate?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "gain_2_target", CTLFLAG_RW, &bbr_gain_to_target, 1, "Does probe bw gain to target??"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "gain_extra_time", CTLFLAG_RW, &bbr_gain_gets_extra_too, 1, "Does probe bw gain get the extra time too?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "ld_div", CTLFLAG_RW, &bbr_drain_drop_div, 5, "Long drain drop divider?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "ld_mul", CTLFLAG_RW, &bbr_drain_drop_mul, 4, "Long drain drop multiplier?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "rand_ot_disc", CTLFLAG_RW, &bbr_rand_ot, 50, "Random discount of the ot?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "dr_filter_life", CTLFLAG_RW, &bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT, "How many packet-epochs does the b/w delivery rate last?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "subdrain_applimited", CTLFLAG_RW, &bbr_sub_drain_app_limit, 0, "Does our sub-state drain invoke app limited if its long?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW, &bbr_sub_drain_slam_cwnd, 0, "Should we set/recover cwnd for sub-state drain?"); 
SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW, &bbr_slam_cwnd_in_main_drain, 0, "Should we set/recover cwnd for main-state drain?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "google_gets_earlyout", CTLFLAG_RW, &google_allow_early_out, 1, "Should we allow google probe-bw/drain to exit early at flight target?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "google_exit_loss", CTLFLAG_RW, &google_consider_lost, 1, "Should we have losses exit gain of probebw in google mode?"); /* Startup controls */ bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "startup", CTLFLAG_RW, 0, "Startup controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "cheat_iwnd", CTLFLAG_RW, &bbr_sends_full_iwnd, 1, "Do we not pace but burst out initial windows as our TSO size?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "loss_threshold", CTLFLAG_RW, &bbr_startup_loss_thresh, 2000, "In startup what is the loss threshold in a pe that will exit us from startup?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "use_lowerpg", CTLFLAG_RW, &bbr_use_lower_gain_in_startup, 1, "Should we use a lower hptsi gain if we see loss in startup?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "gain", CTLFLAG_RW, &bbr_start_exit, 25, "What gain percent do we need to see to stay in startup?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "low_gain", CTLFLAG_RW, &bbr_low_start_exit, 15, "What gain percent do we need to see to stay in the lower gain startup?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "loss_exit", CTLFLAG_RW, &bbr_exit_startup_at_loss, 1, "Should we exit startup at loss in an epoch if we are not gaining?"); /* CWND controls */ bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "cwnd", CTLFLAG_RW, 0, "Cwnd controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "tar_rtt", CTLFLAG_RW, &bbr_cwndtarget_rtt_touse, 0, "Target cwnd rtt measurement to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "may_shrink", CTLFLAG_RW, &bbr_cwnd_may_shrink, 0, "Can the cwnd shrink if it would grow to more than the target?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "max_target_limit", CTLFLAG_RW, &bbr_target_cwnd_mult_limit, 8, "Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink (0=no)?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "highspeed_min", CTLFLAG_RW, &bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS, "What is the high-speed min cwnd (rttProp under 1ms)"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "lowspeed_min", CTLFLAG_RW, &bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS, "What is the min cwnd (rttProp > 1ms)"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "initwin", CTLFLAG_RW, &bbr_def_init_win, 10, "What is the BBR initial window, if 0 use tcp version"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "do_loss_red", CTLFLAG_RW, &bbr_do_red, 600, "Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "red_scale", CTLFLAG_RW, &bbr_red_scale,
20000, "What RTT do we scale with?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "red_growslow", CTLFLAG_RW, &bbr_red_growth_restrict, 1, "Do we restrict cwnd growth for whats in flight?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "red_div", CTLFLAG_RW, &bbr_red_div, 2, "If we reduce whats the divisor?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "red_mul", CTLFLAG_RW, &bbr_red_mul, 1, "If we reduce whats the mulitiplier?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "target_is_unit", CTLFLAG_RW, &bbr_target_is_bbunit, 0, "Is the state target the pacing_gain or BBR_UNIT?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "drop_limit", CTLFLAG_RW, &bbr_drop_limit, 0, "Number of segments limit for drop (0=use min_cwnd w/flight)?"); /* Timeout controls */ bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "timeout", CTLFLAG_RW, 0, "Time out controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "delack", CTLFLAG_RW, &bbr_delack_time, 100000, "BBR's delayed ack time"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "tlp_uses", CTLFLAG_RW, &bbr_tlp_type_to_use, 3, "RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "persmin", CTLFLAG_RW, &bbr_persist_min, 250000, "What is the minimum time in microseconds between persists"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "persmax", CTLFLAG_RW, &bbr_persist_max, 1000000, "What is the largest delay in microseconds between persists"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "tlp_minto", CTLFLAG_RW, &bbr_tlp_min, 10000, "TLP Min timeout in usecs"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "tlp_dack_time", CTLFLAG_RW, &bbr_delayed_ack_time, 200000, "TLP delayed ack compensation value"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "minrto", CTLFLAG_RW, &bbr_rto_min_ms, 30, "Minimum RTO in ms"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "maxrto", CTLFLAG_RW, &bbr_rto_max_sec, 4, "Maxiumum RTO in seconds -- should be at least as large as min_rto"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "tlp_retry", CTLFLAG_RW, &bbr_tlp_max_resend, 2, "How many times does TLP retry a single segment or multiple with no ACK"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "minto", CTLFLAG_RW, &bbr_min_to, 1000, "Minimum rack timeout in useconds"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "pktdelay", CTLFLAG_RW, &bbr_pkt_delay, 1000, "Extra RACK time (in useconds) besides reordering thresh"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "incr_tmrs", CTLFLAG_RW, &bbr_incr_timers, 1, "Increase the RXT/TLP timer by the pacing time used?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW, &bbr_marks_rxt_sack_passed, 0, "Mark sack passed on all those not ack'd when a RXT hits?"); /* Policer controls */ bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "policer", CTLFLAG_RW, 0, "Policer controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "detect_enable", CTLFLAG_RW, 
&bbr_policer_detection_enabled, 1, "Is policer detection enabled?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "min_pes", CTLFLAG_RW, &bbr_lt_intvl_min_rtts, 4, "Minimum number of PE's?"); SYSCTL_ADD_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "bwdiff", CTLFLAG_RW, &bbr_lt_bw_diff, (4000/8), "Minimal bw diff?"); SYSCTL_ADD_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "bwratio", CTLFLAG_RW, &bbr_lt_bw_ratio, 8, "Minimal bw ratio?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "from_rack_rxt", CTLFLAG_RW, &bbr_policer_call_from_rack_to, 0, "Do we call the policer detection code from a rack-timeout?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "false_postive", CTLFLAG_RW, &bbr_lt_intvl_fp, 0, "What packet epoch do we do false-positive detection at (0=no)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "loss_thresh", CTLFLAG_RW, &bbr_lt_loss_thresh, 196, "Loss threshold (196 = 19.6%)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "false_postive_thresh", CTLFLAG_RW, &bbr_lt_fd_thresh, 100, "What percentage is the false detection threshold (150=15.0)?"); /* All the rest */ SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "cheat_rxt", CTLFLAG_RW, &bbr_use_rack_resend_cheat, 0, "Do we burst 1ms between sends on retransmissions (like rack)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "error_paceout", CTLFLAG_RW, &bbr_error_base_paceout, 10000, "When we hit an error what is the min to pace out in usec's?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "kill_paceout", CTLFLAG_RW, &bbr_max_net_error_cnt, 10, "When we hit this many errors in a row, kill the session?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "data_after_close", CTLFLAG_RW, &bbr_ignore_data_after_close, 1, "Do we hold off sending a RST until all pending data is ack'd"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "resend_use_tso", CTLFLAG_RW, &bbr_resends_use_tso, 0, "Can resends use TSO?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "sblklimit", CTLFLAG_RW, &bbr_sack_block_limit, 128, "When do we start ignoring small sack blocks"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "bb_verbose", CTLFLAG_RW, &bbr_verbose_logging, 0, "Should BBR black box logging be verbose"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "reorder_thresh", CTLFLAG_RW, &bbr_reorder_thresh, 2, "What factor for rack will be added when seeing reordering (shift right)"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "reorder_fade", CTLFLAG_RW, &bbr_reorder_fade, 0, "Does reorder detection fade, if so how many ms (0 means never)"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, &bbr_tlp_thresh, 1, "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); /* Stats and counters */ /* The pacing counters for hdwr/software can't be in the array */ bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK); bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD, &bbr_hdwr_pacing_enobuf, "Total number of enobufs for hardware paced flows");
SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD, &bbr_nohdwr_pacing_enobuf, "Total number of enobufs for non-hardware paced flows"); bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "hdwr_pacing", CTLFLAG_RD, &bbr_flows_whdwr_pacing, "Total number of hardware paced flows"); bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "software_pacing", CTLFLAG_RD, &bbr_flows_nohdwr_pacing, "Total number of software paced flows"); COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "stats", CTLFLAG_RD, bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats"); COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "opts", CTLFLAG_RD, bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats"); COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "lost", CTLFLAG_RD, bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur"); COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "stateresend", CTLFLAG_RD, bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend"); COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "statetime", CTLFLAG_RD, bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states"); COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls"); SYSCTL_ADD_PROC(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters"); } static inline int32_t bbr_progress_timeout_check(struct tcp_bbr *bbr) { if (bbr->rc_tp->t_maxunacktime && bbr->rc_tp->t_acktime && TSTMP_GT(ticks, bbr->rc_tp->t_acktime)) { if ((((uint32_t)ticks - bbr->rc_tp->t_acktime)) >= bbr->rc_tp->t_maxunacktime) { /* * There is an assumption here that the caller will * drop the connection, so we increment the * statistics. 
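* (Returning 1 below is the signal to the caller to perform that drop.)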
*/ bbr_log_progress_event(bbr, bbr->rc_tp, ticks, PROGRESS_DROP, __LINE__); BBR_STAT_INC(bbr_progress_drops); #ifdef NETFLIX_STATS TCPSTAT_INC(tcps_progdrops); #endif return (1); } } return (0); } static void bbr_counter_destroy(void) { COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE); COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE); COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT); COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT); COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT); counter_u64_free(bbr_flows_whdwr_pacing); counter_u64_free(bbr_flows_nohdwr_pacing); } static __inline void bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts) { memset(l, 0, sizeof(union tcp_log_stackspecific)); l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate; l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate); l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop); l->bw_inuse = bbr_get_bw(bbr); l->inflight = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); l->applimited = bbr->r_ctl.r_app_limited_until; l->delivered = bbr->r_ctl.rc_delivered; l->timeStamp = cts; l->lost = bbr->r_ctl.rc_lost; l->bbr_state = bbr->rc_bbr_state; l->bbr_substate = bbr_state_val(bbr); l->epoch = bbr->r_ctl.rc_rtt_epoch; l->lt_epoch = bbr->r_ctl.rc_lt_epoch; l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain; l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain; l->inhpts = bbr->rc_inp->inp_in_hpts; l->ininput = bbr->rc_inp->inp_in_input; l->use_lt_bw = bbr->rc_lt_use_bw; l->pkts_out = bbr->r_ctl.rc_flight_at_input; l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch; } static void bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = 0; log.u_bbr.flex2 = 0; log.u_bbr.flex5 = 0; log.u_bbr.flex3 = 0; log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate; log.u_bbr.flex7 = reason; log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt; log.u_bbr.flex8 = 0; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BW_RED_EV, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = seq; log.u_bbr.flex2 = count; log.u_bbr.flex8 = mode; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_LOWGAIN, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling, uint8_t reason, uint32_t p_maxseg, int len) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = p_maxseg; log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp; log.u_bbr.flex4 = reason; log.u_bbr.flex5 = bbr->rc_in_persist; log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val; log.u_bbr.flex7 = p_maxseg; log.u_bbr.flex8 = bbr->rc_in_persist; log.u_bbr.pkts_out = 0; log.u_bbr.applimited = len; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, tlen, &log, false, &bbr->rc_tv); } } static void 
bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = seq; log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent; log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ENTREC, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = tso; log.u_bbr.flex2 = maxseg; log.u_bbr.flex3 = mtu; log.u_bbr.flex4 = csum_flags; TCP_LOG_EVENTP(tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_MSGSIZE, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_flowend(struct tcp_bbr *bbr) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct sockbuf *r, *s; struct timeval tv; if (bbr->rc_inp->inp_socket) { r = &bbr->rc_inp->inp_socket->so_rcv; s = &bbr->rc_inp->inp_socket->so_snd; } else { r = s = NULL; } bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv)); TCP_LOG_EVENTP(bbr->rc_tp, NULL, r, s, TCP_LOG_FLOWEND, 0, 0, &log, false, &tv); } } static void bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t lost, uint32_t del) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = lost; log.u_bbr.flex2 = del; log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw; log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt; log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch; log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; log.u_bbr.flex7 = line; log.u_bbr.flex8 = 0; log.u_bbr.inflight = bbr->r_ctl.r_measurement_count; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_PKT_EPOCH, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->r_ctl.rc_lost; log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat; log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat; log.u_bbr.flex7 = line; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIME_EPOCH, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex2 = new_tar; log.u_bbr.flex3 = line; log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs; log.u_bbr.flex5 = bbr_quanta; log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs; log.u_bbr.flex7 = bbr->rc_last_options; log.u_bbr.flex8 = meth; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_STATE_TARGET, 0, 0, &log, false, &bbr->rc_tv); } } static void 
bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int; if (bbr_state_is_pkt_epoch) log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PKTRTT); else log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PROP); log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch; log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000); log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra; log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_STATE, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt; log.u_bbr.flex4 = applied; log.u_bbr.flex5 = rtt; log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex7 = cond; log.u_bbr.flex8 = reas; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_RTT_SHRINKS, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_exit_rec(struct tcp_bbr *bbr) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start; log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_EXITREC, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg, uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = line; log.u_bbr.flex2 = prev_acked; log.u_bbr.flex3 = bytes_this_ack; log.u_bbr.flex4 = chg; log.u_bbr.flex5 = th_ack; log.u_bbr.flex6 = target; log.u_bbr.flex8 = meth; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_CWND, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin) { /* * Log the rtt sample we are applying to the srtt algorithm in * useconds. 
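* tsin is the echoed timestamp the sample was derived from; it is * stashed in flex6 below.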
*/ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = rtt; log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time; log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay; log.u_bbr.flex4 = bbr->rc_tp->ts_offset; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv); log.u_bbr.flex6 = tsin; log.u_bbr.flex7 = 0; log.u_bbr.flex8 = bbr->rc_ack_was_delayed; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, TCP_LOG_RTT, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = time_in; log.u_bbr.flex2 = line; log.u_bbr.flex8 = enter_exit; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_PERSIST, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age; log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int; log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ACKCLEAR, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen, uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = nsegs; log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes; if (m) { struct timespec ts; log.u_bbr.flex3 = m->m_flags; if (m->m_flags & M_TSTMP) { mbuf_tstmp2timespec(m, &ts); tv.tv_sec = ts.tv_sec; tv.tv_usec = ts.tv_nsec / 1000; log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv); } else { log.u_bbr.lt_epoch = 0; } if (m->m_flags & M_TSTMP_LRO) { tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000; log.u_bbr.flex5 = tcp_tv_to_usectick(&tv); } else { /* No arrival timestamp */ log.u_bbr.flex5 = 0; } log.u_bbr.pkts_out = tcp_get_usecs(&tv); } else { log.u_bbr.flex3 = 0; log.u_bbr.flex5 = 0; log.u_bbr.flex6 = 0; log.u_bbr.pkts_out = 0; } log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex7 = bbr->r_wanted_output; log.u_bbr.flex8 = bbr->rc_in_persist; TCP_LOG_EVENTP(bbr->rc_tp, th, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, TCP_LOG_IN, 0, tlen, &log, true, &bbr->rc_tv); } } static void bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val; log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp; log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes; 
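/* pkts_out is hijacked below to carry the highest observed hdwr pacing delay */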
log.u_bbr.flex7 = bbr->r_wanted_output; log.u_bbr.flex8 = bbr->rc_in_persist; log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, 0, &log, true, &bbr->rc_tv); } } static void bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts, int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = o_len; log.u_bbr.flex3 = segcnt; log.u_bbr.flex4 = segsiz; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ENOBUF_JMP, ENOBUFS, len, &log, true, &bbr->rc_tv); } } static void bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = timers; log.u_bbr.flex2 = ret; log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp; log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex8 = hpts_calling; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; uint64_t ar; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->bbr_timer_src; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; ar = (uint64_t)(bbr->r_ctl.rc_resend); ar >>= 32; ar &= 0x00000000ffffffff; log.u_bbr.flex4 = (uint32_t)ar; ar = (uint64_t)bbr->r_ctl.rc_resend; ar &= 0x00000000ffffffff; log.u_bbr.flex5 = (uint32_t)ar; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.flex8 = to_num; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_RTO, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = flex2; log.u_bbr.flex3 = flex3; log.u_bbr.flex4 = 0; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; log.u_bbr.flex8 = reason; log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_REDUCE, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = diag->p_nxt_slot; log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; log.u_bbr.flex5 = diag->slot_remaining; log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; /* Hijack other fields as needed */ log.u_bbr.epoch = 
diag->have_slept; log.u_bbr.lt_epoch = diag->yet_to_sleep; log.u_bbr.pkts_out = diag->co_ret; log.u_bbr.applimited = diag->hpts_sleep_time; log.u_bbr.delivered = diag->p_prev_slot; log.u_bbr.inflight = diag->p_runningtick; log.u_bbr.bw_inuse = diag->wheel_tick; log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.delRate = diag->maxticks; log.u_bbr.cur_del_rate = diag->p_curtick; log.u_bbr.cur_del_rate <<= 32; log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_HPTSDIAG, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt, uint32_t thresh, uint32_t to) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->rc_tp->t_rttvar; log.u_bbr.flex2 = time_since_sent; log.u_bbr.flex3 = srtt; log.u_bbr.flex4 = thresh; log.u_bbr.flex5 = to; log.u_bbr.flex6 = bbr->rc_tp->t_srtt; log.u_bbr.flex8 = mode; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERPREP, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = usecs; log.u_bbr.flex2 = len; log.u_bbr.flex3 = (uint32_t)((bw >> 32) & 0x00000000ffffffff); log.u_bbr.flex4 = (uint32_t)(bw & 0x00000000ffffffff); if (override) log.u_bbr.flex5 = (1 << 2); else log.u_bbr.flex5 = 0; log.u_bbr.flex6 = override; log.u_bbr.flex7 = gain; log.u_bbr.flex8 = mod; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_HPTSI_CALC, 0, len, &log, false, &bbr->rc_tv); } } static void bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->bbr_timer_src; log.u_bbr.flex2 = to; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2; log.u_bbr.flex8 = which; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERSTAR, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = thresh; log.u_bbr.flex2 = lro; log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts; log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.flex6 = srtt; log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift; log.u_bbr.flex8 = frm; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_THRESH_CALC, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_to_cancel(struct tcp_bbr 
*bbr, int32_t line, uint32_t cts, uint8_t hpts_removed) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = bbr->bbr_timer_src; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = bbr->rc_in_persist; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.flex8 = hpts_removed; log.u_bbr.pkts_out = bbr->rc_pacer_started; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio; log.u_bbr.flex2 = (peer_delta >> 32); log.u_bbr.flex3 = (peer_delta & 0x00000000ffffffff); log.u_bbr.flex4 = (delta >> 32); log.u_bbr.flex5 = (delta & 0x00000000ffffffff); log.u_bbr.flex7 = bbr->rc_ts_clock_set; log.u_bbr.flex8 = bbr->rc_ts_cant_be_used; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TSTMP_VAL, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = tsosz; log.u_bbr.flex2 = tls; log.u_bbr.flex3 = tcp_min_hptsi_time; log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min; log.u_bbr.flex5 = old_val; log.u_bbr.flex6 = maxseg; log.u_bbr.flex7 = bbr->rc_no_pacing; log.u_bbr.flex7 <<= 1; log.u_bbr.flex7 |= bbr->rc_past_init_win; if (hdwr) log.u_bbr.flex8 = 0x80 | bbr->rc_use_google; else log.u_bbr.flex8 = bbr->rc_use_google; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BBRTSO, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, uint32_t flags, uint32_t line) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = rsm->r_start; log.u_bbr.flex3 = rsm->r_end; log.u_bbr.flex4 = rsm->r_delivered; log.u_bbr.flex5 = rsm->r_rtr_cnt; log.u_bbr.flex6 = rsm->r_dupack; log.u_bbr.flex7 = rsm->r_tim_lastsent[0]; log.u_bbr.flex8 = rsm->r_flags; /* Hijack the pkts_out fields */ log.u_bbr.applimited = flags; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_RSM_CLEARED, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts, uint32_t flex3, uint32_t flex2, uint32_t flex5, uint32_t flex6, uint32_t pkts_out, int flex7, uint32_t flex4, uint32_t flex1) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = flex2; log.u_bbr.flex3 = flex3; log.u_bbr.flex4 = flex4; log.u_bbr.flex5 = flex5; log.u_bbr.flex6 = flex6; log.u_bbr.flex7 = flex7; /* Hijack the pkts_out fields */ log.u_bbr.pkts_out = pkts_out; log.u_bbr.flex8 = flex8; if
(bbr->rc_ack_was_delayed) log.u_bbr.epoch = bbr->r_ctl.rc_ack_hdwr_delay; else log.u_bbr.epoch = 0; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BBRUPD, 0, flex2, &log, false, &bbr->rc_tv); } } static void bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason, uint32_t newbw, uint32_t obw, uint32_t diff, uint32_t tim) { if (/*bbr_verbose_logging && */(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = reason; log.u_bbr.flex2 = newbw; log.u_bbr.flex3 = obw; log.u_bbr.flex4 = diff; log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost; log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del; log.u_bbr.flex7 = bbr->rc_lt_is_sampling; log.u_bbr.pkts_out = tim; log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw; if (bbr->rc_lt_use_bw == 0) log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch; else log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BWSAMP, 0, 0, &log, false, &bbr->rc_tv); } } static inline void bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; log.u_bbr.flex3 = tp->t_maxunacktime; log.u_bbr.flex4 = tp->t_acktime; log.u_bbr.flex8 = event; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp, uint64_t rate, uint64_t hw_rate, int line, uint32_t cts, int error) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); log.u_bbr.bw_inuse = rate; log.u_bbr.flex5 = line; log.u_bbr.flex6 = error; log.u_bbr.flex8 = bbr->skip_gain; log.u_bbr.flex8 <<= 1; log.u_bbr.flex8 |= bbr->gain_is_limited; log.u_bbr.flex8 <<= 1; log.u_bbr.flex8 |= bbr->bbr_hdrw_pacing; log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_HDWR_PACE, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = slot; log.u_bbr.flex2 = del_by; log.u_bbr.flex3 = prev_delay; log.u_bbr.flex4 = line; log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val; log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay; log.u_bbr.flex7 = (0x0000ffff & bbr->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = bbr->rc_in_persist; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, len, &log, false, &bbr->rc_tv); } } static void bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t 
tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->r_ctl.rc_delivered; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt; log.u_bbr.flex4 = end; log.u_bbr.flex5 = seq; log.u_bbr.flex6 = t; log.u_bbr.flex7 = match; log.u_bbr.flex8 = flags; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BBRRTT, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options); log.u_bbr.flex3 = bbr->r_ctl.gain_epoch; log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs; log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs; log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight; log.u_bbr.flex7 = 0; log.u_bbr.flex8 = entry_method; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_EXIT_GAIN, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); /* R-HU */ log.u_bbr.flex1 = 0; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = 0; log.u_bbr.flex4 = 0; log.u_bbr.flex7 = 0; log.u_bbr.flex8 = settings_desired; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_SETTINGS_CHG, 0, 0, &log, false, &bbr->rc_tv); } } /* * Returns the bw from our filter. */ static inline uint64_t bbr_get_full_bw(struct tcp_bbr *bbr) { uint64_t bw; bw = get_filter_value(&bbr->r_ctl.rc_delrate); return (bw); } static inline void bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line) { uint64_t calclr; uint32_t lost, del; if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch) lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch; else lost = 0; del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del; if (lost == 0) { calclr = 0; } else if (del) { calclr = lost; calclr *= (uint64_t)1000; calclr /= (uint64_t)del; } else { /* Nothing delivered? 
100.0% loss */ calclr = 1000; } bbr->r_ctl.rc_pkt_epoch_loss_rate = (uint32_t)calclr; if (IN_RECOVERY(bbr->rc_tp->t_flags)) bbr->r_ctl.recovery_lr += (uint32_t)calclr; bbr->r_ctl.rc_pkt_epoch++; if (bbr->rc_no_pacing && (bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) { bbr->rc_no_pacing = 0; tcp_bbr_tso_size_check(bbr, cts); } bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time); bbr->r_ctl.rc_pkt_epoch_time = cts; /* What was our loss rate */ bbr_log_pkt_epoch(bbr, cts, line, lost, del); bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered; bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost; } static inline void bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line) { uint32_t epoch_time; /* Tick the RTT clock */ bbr->r_ctl.rc_rtt_epoch++; epoch_time = cts - bbr->r_ctl.rc_rcv_epoch_start; bbr_log_time_epoch(bbr, cts, line, epoch_time); bbr->r_ctl.rc_rcv_epoch_start = cts; } static inline void bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked) { if (SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del)) { bbr->rc_is_pkt_epoch_now = 1; } } /* * Returns the bw from either the b/w filter * or from the lt_bw (if the connection is being * policed). */ static inline uint64_t __bbr_get_bw(struct tcp_bbr *bbr) { uint64_t bw, min_bw; uint64_t rtt; int gm_measure_cnt = 1; /* * For startup we make, like google, a * minimum b/w. This is generated from the * IW and the rttProp. We do fall back to srtt * if for some reason (initial handshake) we don't * have a rttProp. We, in the worst case, fall back * to the configured min_bw (rc_initial_hptsi_bw). */ if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { /* Attempt first to use rttProp */ rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop); if (rtt && (rtt < 0xffffffff)) { measure: min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) * ((uint64_t)1000000); min_bw /= rtt; if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) { min_bw = bbr->r_ctl.rc_initial_hptsi_bw; } } else if (bbr->rc_tp->t_srtt != 0) { /* No rttProp, use srtt? */ rtt = bbr_get_rtt(bbr, BBR_SRTT); goto measure; } else { min_bw = bbr->r_ctl.rc_initial_hptsi_bw; } } else min_bw = 0; if ((bbr->rc_past_init_win == 0) && (bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp))) bbr->rc_past_init_win = 1; if ((bbr->rc_use_google) && (bbr->r_ctl.r_measurement_count >= 1)) gm_measure_cnt = 0; if (gm_measure_cnt && ((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) || (bbr->rc_past_init_win == 0))) { /* For google we use our guess rate until we get 1 measurement */ use_initial_window: rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop); if (rtt && (rtt < 0xffffffff)) { /* * We have an RTT measurement. Use that in * combination with our initial window to calculate * a b/w. 
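* For example (illustrative numbers, not from a trace): a 10-segment initial window of 1448-byte segments over a 20000-usec rttProp gives (14480 * 1000000) / 20000 = 724000 bytes/sec.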
*/ bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) * ((uint64_t)1000000); bw /= rtt; if (bw < bbr->r_ctl.rc_initial_hptsi_bw) { bw = bbr->r_ctl.rc_initial_hptsi_bw; } } else { /* Drop back to the 40 and punt to a default */ bw = bbr->r_ctl.rc_initial_hptsi_bw; } if (bw < 1) /* Probably should panic */ bw = 1; if (bw > min_bw) return (bw); else return (min_bw); } if (bbr->rc_lt_use_bw) bw = bbr->r_ctl.rc_lt_bw; else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0)) bw = bbr->r_ctl.red_bw; else bw = get_filter_value(&bbr->r_ctl.rc_delrate); if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) { /* * Enforce the user-set rate limit, keep in mind that * t_peakrate_thr is in B/s already */ bw = uqmin((uint64_t)bbr->rc_tp->t_peakrate_thr, bw); } if (bw == 0) { /* We should not be at 0, go to the initial window then */ goto use_initial_window; } if (bw < 1) /* Probably should panic */ bw = 1; if (bw < min_bw) bw = min_bw; return (bw); } static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr) { uint64_t bw; bw = __bbr_get_bw(bbr); return (bw); } static inline void bbr_reset_lt_bw_interval(struct tcp_bbr *bbr, uint32_t cts) { bbr->r_ctl.rc_lt_epoch = bbr->r_ctl.rc_pkt_epoch; bbr->r_ctl.rc_lt_time = bbr->r_ctl.rc_del_time; bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered; bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } static inline void bbr_reset_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts) { bbr->rc_lt_is_sampling = 0; bbr->rc_lt_use_bw = 0; bbr->r_ctl.rc_lt_bw = 0; bbr_reset_lt_bw_interval(bbr, cts); } static inline void bbr_lt_bw_samp_done(struct tcp_bbr *bbr, uint64_t bw, uint32_t cts, uint32_t timin) { uint64_t diff; /* Do we have a previous sample? */ if (bbr->r_ctl.rc_lt_bw) { /* Get the diff in bytes per second */ if (bbr->r_ctl.rc_lt_bw > bw) diff = bbr->r_ctl.rc_lt_bw - bw; else diff = bw - bbr->r_ctl.rc_lt_bw; if ((diff <= bbr_lt_bw_diff) || (diff <= (bbr->r_ctl.rc_lt_bw / bbr_lt_bw_ratio))) { /* Consider us policed */ uint32_t saved_bw; saved_bw = (uint32_t)bbr->r_ctl.rc_lt_bw; bbr->r_ctl.rc_lt_bw = (bw + bbr->r_ctl.rc_lt_bw) / 2; /* average of two */ bbr->rc_lt_use_bw = 1; bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; /* * Use pkt based epoch for measuring length of * policer up */ bbr->r_ctl.rc_lt_epoch_use = bbr->r_ctl.rc_pkt_epoch; /* * reason 4 is we need to start considering being * policed */ bbr_log_type_ltbw(bbr, cts, 4, (uint32_t)bw, saved_bw, (uint32_t)diff, timin); return; } } bbr->r_ctl.rc_lt_bw = bw; bbr_reset_lt_bw_interval(bbr, cts); bbr_log_type_ltbw(bbr, cts, 5, 0, (uint32_t)bw, 0, timin); } /* * RRS: Copied from user space! * Calculate a uniformly distributed random number less than upper_bound * avoiding "modulo bias". * * Uniformity is achieved by generating new random numbers until the one * returned is outside the range [0, 2**32 % upper_bound). This * guarantees the selected random number will be inside * [2**32 % upper_bound, 2**32) which maps back to [0, upper_bound) * after reduction modulo upper_bound. */ static uint32_t arc4random_uniform(uint32_t upper_bound) { uint32_t r, min; if (upper_bound < 2) return 0; /* 2**32 % x == (2**32 - x) % x */ min = -upper_bound % upper_bound; /* * This could theoretically loop forever but each retry has * p > 0.5 (worst case, usually far better) of selecting a * number inside the range we need, so it should rarely need * to re-roll. 
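* As a concrete worked case: for upper_bound = 7, min = (2**32 - 7) % 7 = 4, so only the four values r in [0, 4) force a re-roll, and r % 7 is then uniform over [0, 7).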
*/ for (;;) { r = arc4random(); if (r >= min) break; } return r % upper_bound; } static void bbr_randomize_extra_state_time(struct tcp_bbr *bbr) { uint32_t ran, deduct; ran = arc4random_uniform(bbr_rand_ot); if (ran) { deduct = bbr->r_ctl.rc_level_state_extra / ran; bbr->r_ctl.rc_level_state_extra -= deduct; } } /* * Return randomly the starting state * to use in probebw. */ static uint8_t bbr_pick_probebw_substate(struct tcp_bbr *bbr, uint32_t cts) { uint32_t ran; uint8_t ret_val; /* Initialize the offset to 0 */ bbr->r_ctl.rc_exta_time_gd = 0; bbr->rc_hit_state_1 = 0; bbr->r_ctl.rc_level_state_extra = 0; ran = arc4random_uniform((BBR_SUBSTATE_COUNT-1)); /* * The math works funny here :) the return value is used to set the * substate and then the state change is called which increments by * one. So if we return 1 (DRAIN) we will increment to 2 (LEVEL1) when * we fully enter the state. Note that the (8 - 1 - ran) assures that * we return 1 - 7, so we don't return 0 and end up starting in * state 1 (DRAIN). */ ret_val = BBR_SUBSTATE_COUNT - 1 - ran; /* Set an epoch */ if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) bbr_set_epoch(bbr, cts, __LINE__); bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; return (ret_val); } static void bbr_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts, int32_t loss_detected) { uint32_t diff, d_time; uint64_t del_time, bw, lost, delivered; if (bbr->r_use_policer == 0) return; if (bbr->rc_lt_use_bw) { /* We are using lt bw; do we stop yet? */ diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use; if (diff > bbr_lt_bw_max_rtts) { /* Reset it all */ reset_all: bbr_reset_lt_bw_sampling(bbr, cts); if (bbr->rc_filled_pipe) { bbr_set_epoch(bbr, cts, __LINE__); bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); bbr_substate_change(bbr, cts, __LINE__, 0); bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr_log_type_statechange(bbr, cts, __LINE__); } else { /* * This should not happen really * unless we remove the startup/drain * restrictions above. */ bbr->rc_bbr_state = BBR_STATE_STARTUP; bbr_set_epoch(bbr, cts, __LINE__); bbr->r_ctl.rc_bbr_state_time = cts; bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; bbr_set_state_target(bbr, __LINE__); bbr_log_type_statechange(bbr, cts, __LINE__); } /* reason 0 is to stop using lt-bw */ bbr_log_type_ltbw(bbr, cts, 0, 0, 0, 0, 0); return; } if (bbr_lt_intvl_fp == 0) { /* Not doing false-positive detection */ return; } /* False positive detection */ if (diff == bbr_lt_intvl_fp) { /* At bbr_lt_intvl_fp we record the lost and delivered counts */ bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered; bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } else if (diff > (bbr_lt_intvl_min_rtts + bbr_lt_intvl_fp)) { /* Now is our loss rate still high? */ lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost; delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del; if ((delivered == 0) || (((lost * 1000)/delivered) < bbr_lt_fd_thresh)) { /* No, still below our threshold */ bbr_log_type_ltbw(bbr, cts, 7, lost, delivered, 0, 0); } else { /* Yikes, it's still high, it must be a false positive */ bbr_log_type_ltbw(bbr, cts, 8, lost, delivered, 0, 0); goto reset_all; } } return; } /* * Wait for the first loss before sampling, to let the policer * exhaust its tokens and estimate the steady-state rate allowed by * the policer. Starting samples earlier includes bursts that * over-estimate the bw. 
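* (A token-bucket policer typically admits an initial burst at line rate; only once the bucket is drained does the delivered rate settle to the policed rate, which is why we hold off sampling until the first loss.)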
*/ if (bbr->rc_lt_is_sampling == 0) { /* reason 1 is to begin doing the sampling */ if (loss_detected == 0) return; bbr_reset_lt_bw_interval(bbr, cts); bbr->rc_lt_is_sampling = 1; bbr_log_type_ltbw(bbr, cts, 1, 0, 0, 0, 0); return; } /* Now how long were we delivering long term? */ if (TSTMP_GEQ(bbr->r_ctl.rc_del_time, bbr->r_ctl.rc_lt_time)) d_time = bbr->r_ctl.rc_del_time - bbr->r_ctl.rc_lt_time; else d_time = 0; /* To avoid underestimates, reset sampling if we run out of data. */ if (bbr->r_ctl.r_app_limited_until) { /* Can not measure in app-limited state */ bbr_reset_lt_bw_sampling(bbr, cts); /* reason 2 is to reset sampling due to app limits */ bbr_log_type_ltbw(bbr, cts, 2, 0, 0, 0, d_time); return; } diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch; if (diff < bbr_lt_intvl_min_rtts) { /* * need more samples (we don't * start on a round like linux so * we need 1 more). */ /* 6 is not_enough time or no-loss */ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); return; } if (diff > (4 * bbr_lt_intvl_min_rtts)) { /* * For now if we wait too long, reset all sampling. We need * to do some research here, it's possible that we should * base this on how much loss has occurred... something like * if it's under 10% (or some thresh) reset all, otherwise * don't. That's for phase II I guess. */ bbr_reset_lt_bw_sampling(bbr, cts); /* reason 3 is to reset sampling due to too long of sampling */ bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time); return; } /* * End sampling interval when a packet is lost, so we estimate the * policer tokens were exhausted. Stopping the sampling before the * tokens are exhausted under-estimates the policed rate. */ if (loss_detected == 0) { /* 6 is not_enough time or no-loss */ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); return; } /* Calculate packets lost and delivered in sampling interval. */ lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost; delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del; if ((delivered == 0) || (((lost * 1000)/delivered) < bbr_lt_loss_thresh)) { bbr_log_type_ltbw(bbr, cts, 6, lost, delivered, 0, d_time); return; } if (d_time < 1000) { /* Not enough time, wait */ /* 6 is not_enough time or no-loss */ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); return; } if (d_time >= (0xffffffff / USECS_IN_MSEC)) { /* Too long */ bbr_reset_lt_bw_sampling(bbr, cts); /* reason 3 is to reset sampling due to too long of sampling */ bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time); return; } del_time = d_time; bw = delivered; bw *= (uint64_t)USECS_IN_SECOND; bw /= del_time; bbr_lt_bw_samp_done(bbr, bw, cts, d_time); } /* * Allocate a sendmap from our zone. 
*/ static struct bbr_sendmap * bbr_alloc(struct tcp_bbr *bbr) { struct bbr_sendmap *rsm; BBR_STAT_INC(bbr_to_alloc); rsm = uma_zalloc(bbr_zone, (M_NOWAIT | M_ZERO)); if (rsm) { bbr->r_ctl.rc_num_maps_alloced++; return (rsm); } if (bbr->r_ctl.rc_free_cnt) { BBR_STAT_INC(bbr_to_alloc_emerg); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next); bbr->r_ctl.rc_free_cnt--; return (rsm); } BBR_STAT_INC(bbr_to_alloc_failed); return (NULL); } static struct bbr_sendmap * bbr_alloc_full_limit(struct tcp_bbr *bbr) { if ((V_tcp_map_entries_limit > 0) && (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { BBR_STAT_INC(bbr_alloc_limited); if (!bbr->alloc_limit_reported) { bbr->alloc_limit_reported = 1; BBR_STAT_INC(bbr_alloc_limited_conns); } return (NULL); } return (bbr_alloc(bbr)); } /* wrapper to allocate a sendmap entry, subject to a specific limit */ static struct bbr_sendmap * bbr_alloc_limit(struct tcp_bbr *bbr, uint8_t limit_type) { struct bbr_sendmap *rsm; if (limit_type) { /* currently there is only one limit type */ if (V_tcp_map_split_limit > 0 && bbr->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { BBR_STAT_INC(bbr_split_limited); if (!bbr->alloc_limit_reported) { bbr->alloc_limit_reported = 1; BBR_STAT_INC(bbr_alloc_limited_conns); } return (NULL); } } /* allocate and mark in the limit type, if set */ rsm = bbr_alloc(bbr); if (rsm != NULL && limit_type) { rsm->r_limit_type = limit_type; bbr->r_ctl.rc_num_split_allocs++; } return (rsm); } static void bbr_free(struct tcp_bbr *bbr, struct bbr_sendmap *rsm) { if (rsm->r_limit_type) { /* currently there is only one limit type */ bbr->r_ctl.rc_num_split_allocs--; } if (rsm->r_is_smallmap) bbr->r_ctl.rc_num_small_maps_alloced--; if (bbr->r_ctl.rc_tlp_send == rsm) bbr->r_ctl.rc_tlp_send = NULL; if (bbr->r_ctl.rc_resend == rsm) { bbr->r_ctl.rc_resend = NULL; } if (bbr->r_ctl.rc_next == rsm) bbr->r_ctl.rc_next = NULL; if (bbr->r_ctl.rc_sacklast == rsm) bbr->r_ctl.rc_sacklast = NULL; if (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) { memset(rsm, 0, sizeof(struct bbr_sendmap)); TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); rsm->r_limit_type = 0; bbr->r_ctl.rc_free_cnt++; return; } bbr->r_ctl.rc_num_maps_alloced--; uma_zfree(bbr_zone, rsm); } /* * Returns the BDP. */ static uint64_t bbr_get_bw_delay_prod(uint64_t rtt, uint64_t bw) { /* * Calculate the bytes in flight needed given the bw (in bytes per * second) and the specified rtt in useconds. We need to put out the * returned value per RTT to match that rate. Gain will normally * raise it up from there. * * This should not overflow as long as the bandwidth is below 1 * TByte per second (bw < 10**12 = 2**40) and the rtt is smaller * than 1000 seconds (rtt < 10**3 * 10**6 = 10**9 = 2**30). */ uint64_t usec_per_sec; usec_per_sec = USECS_IN_SECOND; return ((rtt * bw) / usec_per_sec); } /* * Return the initial cwnd. */ static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp) { uint32_t i_cwnd; if (bbr->rc_init_win) { i_cwnd = bbr->rc_init_win * tp->t_maxseg; } else if (V_tcp_initcwnd_segments) i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), max(2 * tp->t_maxseg, 14600)); else if (V_tcp_do_rfc3390) i_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (tp->t_maxseg > 2190) i_cwnd = 2 * tp->t_maxseg; else if (tp->t_maxseg > 1095) i_cwnd = 3 * tp->t_maxseg; else i_cwnd = 4 * tp->t_maxseg; } return (i_cwnd); } /* * Given a specified gain, return the target * cwnd based on that gain. 
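* As an illustrative example (values assumed, not measured): with bw = 1250000 bytes/sec and an rtt of 100000 usecs the BDP is 125000 bytes, so a gain of 2 * BBR_UNIT yields a raw target cwnd of 250000 bytes before the quanta and min-cwnd adjustments applied below.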
*/ static uint32_t bbr_get_raw_target_cwnd(struct tcp_bbr *bbr, uint32_t gain, uint64_t bw) { uint64_t bdp, rtt; uint32_t cwnd; if ((get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) || (bbr_get_full_bw(bbr) == 0)) { /* No measurements yet */ return (bbr_initial_cwnd(bbr, bbr->rc_tp)); } /* * Get bytes per RTT needed (rttProp is normally in * bbr_cwndtarget_rtt_touse) */ rtt = bbr_get_rtt(bbr, bbr_cwndtarget_rtt_touse); /* Get the bdp from the two values */ bdp = bbr_get_bw_delay_prod(rtt, bw); /* Now apply the gain */ cwnd = (uint32_t)(((bdp * ((uint64_t)gain)) + (uint64_t)(BBR_UNIT - 1)) / ((uint64_t)BBR_UNIT)); return (cwnd); } static uint32_t bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain) { uint32_t cwnd, mss; mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); /* Get the base cwnd with gain rounded to a mss */ cwnd = roundup(bbr_get_raw_target_cwnd(bbr, bw, gain), mss); /* * Add in N (2 default since we do not have a * fq layer to trap packets in) quanta per the I-D * section 4.2.3.2 quanta adjust. */ cwnd += (bbr_quanta * bbr->r_ctl.rc_pace_max_segs); if (bbr->rc_use_google) { if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && (bbr_state_val(bbr) == BBR_SUB_GAIN)) { /* * The linux implementation adds * an extra 2 x mss in the gain cycle, which * is documented nowhere except in the code, * so we add more for Neal's undocumented feature */ cwnd += 2 * mss; } if ((cwnd / mss) & 0x1) { /* Round up for odd num mss */ cwnd += mss; } } /* Are we below the min cwnd? */ if (cwnd < get_min_cwnd(bbr)) return (get_min_cwnd(bbr)); return (cwnd); } static uint16_t bbr_gain_adjust(struct tcp_bbr *bbr, uint16_t gain) { if (gain < 1) gain = 1; return (gain); } static uint32_t bbr_get_header_oh(struct tcp_bbr *bbr) { int seg_oh; seg_oh = 0; if (bbr->r_ctl.rc_inc_tcp_oh) { /* Do we include TCP overhead? */ seg_oh = (bbr->rc_last_options + sizeof(struct tcphdr)); } if (bbr->r_ctl.rc_inc_ip_oh) { /* Do we include IP overhead? */ #ifdef INET6 if (bbr->r_is_v6) seg_oh += sizeof(struct ip6_hdr); else #endif #ifdef INET seg_oh += sizeof(struct ip); #endif } if (bbr->r_ctl.rc_inc_enet_oh) { /* Do we include the ethernet overhead? */ seg_oh += sizeof(struct ether_header); } return(seg_oh); } static uint32_t bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw) { uint64_t divor, res, tim; if (useconds_time == 0) return (0); gain = bbr_gain_adjust(bbr, gain); divor = (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT; tim = useconds_time; res = (tim * bw * gain) / divor; if (res == 0) res = 1; return ((uint32_t)res); } /* * Given a gain and a length return the delay in useconds that * should be used to evenly space out packets * on the connection (based on the gain factor). */ static uint32_t bbr_get_pacing_delay(struct tcp_bbr *bbr, uint16_t gain, int32_t len, uint32_t cts, int nolog) { uint64_t bw, lentim, res; uint32_t usecs, srtt, over = 0; uint32_t seg_oh, num_segs, maxseg; if (len == 0) return (0); maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; num_segs = (len + maxseg - 1) / maxseg; if (bbr->rc_use_google == 0) { seg_oh = bbr_get_header_oh(bbr); len += (num_segs * seg_oh); } gain = bbr_gain_adjust(bbr, gain); bw = bbr_get_bw(bbr); if (bbr->rc_use_google) { uint64_t cbw; /* * Reduce the b/w by the google discount * factor 10 = 1%. 
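* e.g. with bbr_google_discount = 10 the computation below is cbw = bw * (1000 - 10) / 1000, i.e. a 1% reduction.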
cbw = bw * (uint64_t)(1000 - bbr->r_ctl.bbr_google_discount); cbw /= (uint64_t)1000; /* We don't apply a discount if it results in 0 */ if (cbw > 0) bw = cbw; } lentim = ((uint64_t)len * (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT); res = lentim / ((uint64_t)gain * bw); if (res == 0) res = 1; usecs = (uint32_t)res; srtt = bbr_get_rtt(bbr, BBR_SRTT); if (bbr_hptsi_max_mul && bbr_hptsi_max_div && (bbr->rc_use_google == 0) && (usecs > ((srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div))) { /* * We cannot let the delay be more than 1/2 the srtt time. * Otherwise we cannot pace out or send properly. */ over = usecs = (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div; BBR_STAT_INC(bbr_hpts_min_time); } if (!nolog) bbr_log_pacing_delay_calc(bbr, gain, len, cts, usecs, bw, over, 1); return (usecs); } static void bbr_ack_received(struct tcpcb *tp, struct tcp_bbr *bbr, struct tcphdr *th, uint32_t bytes_this_ack, uint32_t sack_changed, uint32_t prev_acked, int32_t line, uint32_t losses) { INP_WLOCK_ASSERT(tp->t_inpcb); uint64_t bw; uint32_t cwnd, target_cwnd, saved_bytes, maxseg; int32_t meth; #ifdef STATS if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { /* * Stretch acks and compressed acks will cause this to * oscillate but we are doing it the same way as the main * stack so it will be comparable (though possibly not * ideal). */ int32_t cgput; int64_t gput, time_stamp; gput = (int64_t) (th->th_ack - tp->gput_seq) * 8; time_stamp = max(1, ((bbr->r_ctl.rc_rcvtime - tp->gput_ts) / 1000)); cgput = gput / time_stamp; stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, cgput); if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = cgput; } #endif if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) { /* We don't change anything in probe-rtt */ return; } maxseg = tp->t_maxseg - bbr->rc_last_options; saved_bytes = bytes_this_ack; bytes_this_ack += sack_changed; if (bytes_this_ack > prev_acked) { bytes_this_ack -= prev_acked; /* * A byte ack'd gives us a full mss, * to be like linux, i.e. they count packets. */ if ((bytes_this_ack < maxseg) && bbr->rc_use_google) bytes_this_ack = maxseg; } else { /* Unlikely */ bytes_this_ack = 0; } cwnd = tp->snd_cwnd; bw = get_filter_value(&bbr->r_ctl.rc_delrate); if (bw) target_cwnd = bbr_get_target_cwnd(bbr, bw, (uint32_t)bbr->r_ctl.rc_bbr_cwnd_gain); else target_cwnd = bbr_initial_cwnd(bbr, bbr->rc_tp); if (IN_RECOVERY(tp->t_flags) && (bbr->bbr_prev_in_rec == 0)) { /* * We are entering recovery and * thus packet conservation. 
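* Packet conservation pins cwnd to the bytes actually in flight plus the bytes newly acked, so roughly one new packet may be sent per packet that leaves the network, until an rttProp has elapsed (see the time_in check below).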
*/ bbr->pkt_conservation = 1; bbr->r_ctl.rc_recovery_start = bbr->r_ctl.rc_rcvtime; cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + bytes_this_ack; } if (IN_RECOVERY(tp->t_flags)) { uint32_t flight; bbr->bbr_prev_in_rec = 1; if (cwnd > losses) { cwnd -= losses; if (cwnd < maxseg) cwnd = maxseg; } else cwnd = maxseg; flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr_log_type_cwndupd(bbr, flight, 0, losses, 10, 0, 0, line); if (bbr->pkt_conservation) { uint32_t time_in; if (TSTMP_GEQ(bbr->r_ctl.rc_rcvtime, bbr->r_ctl.rc_recovery_start)) time_in = bbr->r_ctl.rc_rcvtime - bbr->r_ctl.rc_recovery_start; else time_in = 0; if (time_in >= bbr_get_rtt(bbr, BBR_RTT_PROP)) { /* Clear packet conservation after an rttProp */ bbr->pkt_conservation = 0; } else { if ((flight + bytes_this_ack) > cwnd) cwnd = flight + bytes_this_ack; if (cwnd < get_min_cwnd(bbr)) cwnd = get_min_cwnd(bbr); tp->snd_cwnd = cwnd; bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, 1, target_cwnd, th->th_ack, line); return; } } } else bbr->bbr_prev_in_rec = 0; if ((bbr->rc_use_google == 0) && bbr->r_ctl.restrict_growth) { bbr->r_ctl.restrict_growth--; if (bytes_this_ack > maxseg) bytes_this_ack = maxseg; } if (bbr->rc_filled_pipe) { /* * Here we have exited startup and filled the pipe. We will * thus allow the cwnd to shrink to the target. We hit here * mostly. */ uint32_t s_cwnd; meth = 2; s_cwnd = min((cwnd + bytes_this_ack), target_cwnd); if (s_cwnd > cwnd) cwnd = s_cwnd; else if (bbr_cwnd_may_shrink || bbr->rc_use_google || bbr->rc_no_pacing) cwnd = s_cwnd; } else { /* * Here we are still in startup, we increase cwnd by what * has been acked. */ if ((cwnd < target_cwnd) || (bbr->rc_past_init_win == 0)) { meth = 3; cwnd += bytes_this_ack; } else { /* * Method 4 means we are at target so no gain in * startup and past the initial window. */ meth = 4; } } tp->snd_cwnd = max(cwnd, get_min_cwnd(bbr)); bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, meth, target_cwnd, th->th_ack, line); } static void tcp_bbr_partialack(struct tcpcb *tp) { struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= tp->snd_cwnd) { bbr->r_wanted_output = 1; } } static void bbr_post_recovery(struct tcpcb *tp) { struct tcp_bbr *bbr; uint32_t flight; INP_WLOCK_ASSERT(tp->t_inpcb); bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* * Here we just exit recovery. */ EXIT_RECOVERY(tp->t_flags); /* Lock in our b/w reduction for the specified number of pkt-epochs */ bbr->r_recovery_bw = 0; tp->snd_recover = tp->snd_una; tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); bbr->pkt_conservation = 0; if (bbr->rc_use_google == 0) { /* * For non-google mode let's * go ahead and make sure we clear * the recovery state so if we * bounce back into recovery we * will do PC. 
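* (PC = packet conservation; clearing bbr_prev_in_rec below re-arms the packet-conservation entry check in bbr_ack_received().)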
*/ bbr->bbr_prev_in_rec = 0; } bbr_log_type_exit_rec(bbr); if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent); bbr_log_type_cwndupd(bbr, 0, 0, 0, 15, 0, 0, __LINE__); } else { /* For probe-rtt case let's fix up its saved_cwnd */ if (bbr->r_ctl.rc_saved_cwnd < bbr->r_ctl.rc_cwnd_on_ent) { bbr->r_ctl.rc_saved_cwnd = bbr->r_ctl.rc_cwnd_on_ent; bbr_log_type_cwndupd(bbr, 0, 0, 0, 16, 0, 0, __LINE__); } } flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if ((bbr->rc_use_google == 0) && bbr_do_red) { uint64_t val, lr2use; uint32_t maxseg, newcwnd, acks_inflight, ratio, cwnd; uint32_t *cwnd_p; if (bbr_get_rtt(bbr, BBR_SRTT)) { val = ((uint64_t)bbr_get_rtt(bbr, BBR_RTT_PROP) * (uint64_t)1000); val /= bbr_get_rtt(bbr, BBR_SRTT); ratio = (uint32_t)val; } else ratio = 1000; bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, bbr->r_ctl.recovery_lr, 21, ratio, bbr->r_ctl.rc_red_cwnd_pe, __LINE__); if ((ratio < bbr_do_red) || (bbr_do_red == 0)) goto done; if (((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && bbr_prtt_slam_cwnd) || (bbr_sub_drain_slam_cwnd && (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && bbr->rc_hit_state_1 && (bbr_state_val(bbr) == BBR_SUB_DRAIN)) || ((bbr->rc_bbr_state == BBR_STATE_DRAIN) && bbr_slam_cwnd_in_main_drain)) { /* * Here we must poke at the saved cwnd * as well as the cwnd. */ cwnd = bbr->r_ctl.rc_saved_cwnd; cwnd_p = &bbr->r_ctl.rc_saved_cwnd; } else { cwnd = tp->snd_cwnd; cwnd_p = &tp->snd_cwnd; } maxseg = tp->t_maxseg - bbr->rc_last_options; /* Add the overall lr with the recovery lr */ if (bbr->r_ctl.rc_lost == 0) lr2use = 0; else if (bbr->r_ctl.rc_delivered == 0) lr2use = 1000; else { lr2use = bbr->r_ctl.rc_lost * 1000; lr2use /= bbr->r_ctl.rc_delivered; } lr2use += bbr->r_ctl.recovery_lr; acks_inflight = (flight / (maxseg * 2)); if (bbr_red_scale) { lr2use *= bbr_get_rtt(bbr, BBR_SRTT); lr2use /= bbr_red_scale; if ((bbr_red_growth_restrict) && ((bbr_get_rtt(bbr, BBR_SRTT)/bbr_red_scale) > 1)) bbr->r_ctl.restrict_growth += acks_inflight; } if (lr2use) { val = (uint64_t)cwnd * lr2use; val /= 1000; if (cwnd > val) newcwnd = roundup((cwnd - val), maxseg); else newcwnd = maxseg; } else { val = (uint64_t)cwnd * (uint64_t)bbr_red_mul; val /= (uint64_t)bbr_red_div; newcwnd = roundup((uint32_t)val, maxseg); } /* with standard delayed acks how many acks can I expect? */ if (bbr_drop_limit == 0) { /* * Anticipate how much we will * raise the cwnd based on the acks. */ if ((newcwnd + (acks_inflight * maxseg)) < get_min_cwnd(bbr)) { /* We do enforce the min (with the acks) */ newcwnd = (get_min_cwnd(bbr) - acks_inflight); } } else { /* * A strict drop limit of N is in place */ if (newcwnd < (bbr_drop_limit * maxseg)) { newcwnd = bbr_drop_limit * maxseg; } } /* For the next N acks do we restrict the growth */ *cwnd_p = newcwnd; if (tp->snd_cwnd > newcwnd) tp->snd_cwnd = newcwnd; bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, val, 22, (uint32_t)lr2use, bbr_get_rtt(bbr, BBR_SRTT), __LINE__); bbr->r_ctl.rc_red_cwnd_pe = bbr->r_ctl.rc_pkt_epoch; } done: bbr->r_ctl.recovery_lr = 0; if (flight <= tp->snd_cwnd) { bbr->r_wanted_output = 1; } tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); } static void bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts) { bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate); /* Limit the drop in b/w to 1/2 our current filter. 
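* The net effect is that red_bw is clamped to at most the last measured delivery rate (rc_bbr_cur_del_rate) and at least half of the current filter value.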
*/ if (bbr->r_ctl.red_bw > bbr->r_ctl.rc_bbr_cur_del_rate) bbr->r_ctl.red_bw = bbr->r_ctl.rc_bbr_cur_del_rate; if (bbr->r_ctl.red_bw < (get_filter_value(&bbr->r_ctl.rc_delrate) / 2)) bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate) / 2; tcp_bbr_tso_size_check(bbr, cts); } static void bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_sendmap *rsm) { struct tcp_bbr *bbr; INP_WLOCK_ASSERT(tp->t_inpcb); bbr = (struct tcp_bbr *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: if (!IN_RECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; /* Start a new epoch */ bbr_set_pktepoch(bbr, bbr->r_ctl.rc_rcvtime, __LINE__); if (bbr->rc_lt_is_sampling || bbr->rc_lt_use_bw) { /* * Move forward the lt epoch * so it won't count the truncated * epoch. */ bbr->r_ctl.rc_lt_epoch++; } if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { /* * Just like the policer detection code * if we are in startup we must push * forward the last startup epoch * to hide the truncated PE. */ bbr->r_ctl.rc_bbr_last_startup_epoch++; } bbr->r_ctl.rc_cwnd_on_ent = tp->snd_cwnd; ENTER_RECOVERY(tp->t_flags); bbr->rc_tlp_rtx_out = 0; bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate; tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); if (bbr->rc_inp->inp_in_hpts && ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) { /* * When we enter recovery, we need to restart * any timers. This may mean we gain an agg * early, which will be made up for at the last * rxt out. */ bbr->rc_timer_first = 1; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } /* * Calculate a new cwnd based on the current * delivery rate with no gain. We get the bdp * without gaining it up like we normally would and * we use the last cur_del_rate. */ if ((bbr->rc_use_google == 0) && (bbr->r_ctl.bbr_rttprobe_gain_val || (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT))) { tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + (tp->t_maxseg - bbr->rc_last_options); if (tp->snd_cwnd < get_min_cwnd(bbr)) { /* We always gate to min cwnd */ tp->snd_cwnd = get_min_cwnd(bbr); } bbr_log_type_cwndupd(bbr, 0, 0, 0, 14, 0, 0, __LINE__); } bbr_log_type_enter_rec(bbr, rsm->r_start); } break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ bbr_reset_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime); if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent); bbr_log_type_cwndupd(bbr, 0, 0, 0, 13, 0, 0, __LINE__); } tp->t_badrxtwin = 0; break; } } /* * Indicate whether this ack should be delayed. We can delay the ack if * the following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. * - Delayed acks are enabled or this is a half-synchronized T/TCP * connection. * - The data being acked is less than a full segment (a stretch ack * covering more than a segment should be acked immediately). * - nsegs is 1 (if it's more than that we received more than 1 ack). 
*/ #define DELAY_ACK(tp, bbr, nsegs) \ (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ ((bbr->bbr_segs_rcvd + nsegs) < tp->t_delayed_ack) && \ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) /* * Return the lowest RSM in the map of * packets still in flight that is not acked. * This should normally succeed on the first one * since we remove packets from the send * map after they are marked ACKED. */ static struct bbr_sendmap * bbr_find_lowest_rsm(struct tcp_bbr *bbr) { struct bbr_sendmap *rsm; /* * Walk the time-order transmitted list looking for an rsm that is * not acked. This will be the one that was sent the longest time * ago that is still outstanding. */ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_tmap, r_tnext) { if (rsm->r_flags & BBR_ACKED) { continue; } goto finish; } finish: return (rsm); } static struct bbr_sendmap * bbr_find_high_nonack(struct tcp_bbr *bbr, struct bbr_sendmap *rsm) { struct bbr_sendmap *prsm; /* * Walk the sequence order list backward until we arrive at * the highest seq not acked. In theory when this is called it * should be the last segment (which it was not). */ prsm = rsm; TAILQ_FOREACH_REVERSE_FROM(prsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { if (prsm->r_flags & (BBR_ACKED | BBR_HAS_FIN)) { continue; } return (prsm); } return (NULL); } /* * Returns to the caller the number of microseconds that * the packet can be outstanding before we think we * should have had an ack returned. */ static uint32_t bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm) { /* * lro is the flag we use to determine if we have seen reordering. * If it gets set we have seen reordering. The reorder logic either * works in one of two ways: * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time has * passed we no longer consider reordering to be occurring. * * Or if reorder-fade is 0, then once we see reordering we consider * the connection to always be subject to reordering and just set lro * to 1. * * In the end if lro is non-zero we add the extra time for * reordering in. */ int32_t lro; uint32_t thresh, t_rxtcur; if (srtt == 0) srtt = 1; if (bbr->r_ctl.rc_reorder_ts) { if (bbr->r_ctl.rc_reorder_fade) { if (SEQ_GEQ(cts, bbr->r_ctl.rc_reorder_ts)) { lro = cts - bbr->r_ctl.rc_reorder_ts; if (lro == 0) { /* * No time has passed since the last * reorder, mark it as reordering. */ lro = 1; } } else { /* Negative time? */ lro = 0; } if (lro > bbr->r_ctl.rc_reorder_fade) { /* Turn off reordering seen too */ bbr->r_ctl.rc_reorder_ts = 0; lro = 0; } } else { /* Reordering does not fade */ lro = 1; } } else { lro = 0; } thresh = srtt + bbr->r_ctl.rc_pkt_delay; if (lro) { /* It must be set, if not you get 1/4 rtt */ if (bbr->r_ctl.rc_reorder_shift) thresh += (srtt >> bbr->r_ctl.rc_reorder_shift); else thresh += (srtt >> 2); } else { thresh += 1000; } /* We don't let the rack timeout be above an RTO */ if ((bbr->rc_tp)->t_srtt == 0) t_rxtcur = BBR_INITIAL_RTO; else t_rxtcur = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); if (thresh > t_rxtcur) { thresh = t_rxtcur; } /* And we don't want it above the RTO max either */ if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND); } bbr_log_thresh_choice(bbr, cts, thresh, lro, srtt, rsm, BBR_TO_FRM_RACK); return (thresh); } /* * Return to the caller the amount of time in microseconds * that should be used for the TLP timer from the last * send time of this packet. 
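* For example (illustrative numbers): with srtt = 40000 usecs and rc_tlp_threshold = 2, the base threshold is 40000 + 20000 = 60000 usecs, before the delayed-ack, RTO and bbr_tlp_min clamps below are applied.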
*/ static uint32_t bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t srtt, uint32_t cts) { uint32_t thresh, len, maxseg, t_rxtcur; struct bbr_sendmap *prsm; if (srtt == 0) srtt = 1; if (bbr->rc_tlp_threshold) thresh = srtt + (srtt / bbr->rc_tlp_threshold); else thresh = (srtt * 2); maxseg = tp->t_maxseg - bbr->rc_last_options; /* Get the previous sent packet, if any */ len = rsm->r_end - rsm->r_start; /* 2.1 behavior */ prsm = TAILQ_PREV(rsm, bbr_head, r_tnext); if (prsm && (len <= maxseg)) { /* * Two packets outstanding, thresh should be (2*srtt) + * possible inter-packet delay (if any). */ uint32_t inter_gap = 0; int idx, nidx; idx = rsm->r_rtr_cnt - 1; nidx = prsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], prsm->r_tim_lastsent[nidx])) { /* Yes it was sent later (or at the same time) */ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; } thresh += inter_gap; } else if (len <= maxseg) { /* * Possibly compensate for delayed-ack. */ uint32_t alt_thresh; alt_thresh = srtt + (srtt / 2) + bbr_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } /* Not above the current RTO */ if (tp->t_srtt == 0) t_rxtcur = BBR_INITIAL_RTO; else t_rxtcur = TICKS_2_USEC(tp->t_rxtcur); bbr_log_thresh_choice(bbr, cts, thresh, t_rxtcur, srtt, rsm, BBR_TO_FRM_TLP); /* Not above an RTO */ if (thresh > t_rxtcur) { thresh = t_rxtcur; } /* Not above an RTO max */ if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND); } /* And now apply the user TLP min */ if (thresh < bbr_tlp_min) { thresh = bbr_tlp_min; } return (thresh); } /* * Return one of three RTTs to use (in microseconds). */ static __inline uint32_t bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type) { uint32_t f_rtt; uint32_t srtt; f_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop); if (get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) { /* We have no rtt at all */ if (bbr->rc_tp->t_srtt == 0) f_rtt = BBR_INITIAL_RTO; else f_rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); /* * Since we don't know how good the rtt is apply a * delayed-ack min */ if (f_rtt < bbr_delayed_ack_time) { f_rtt = bbr_delayed_ack_time; } } /* Take the filter version or last measured pkt-rtt */ if (rtt_type == BBR_RTT_PROP) { srtt = f_rtt; } else if (rtt_type == BBR_RTT_PKTRTT) { if (bbr->r_ctl.rc_pkt_epoch_rtt) { srtt = bbr->r_ctl.rc_pkt_epoch_rtt; } else { /* No pkt rtt yet */ srtt = f_rtt; } } else if (rtt_type == BBR_RTT_RACK) { srtt = bbr->r_ctl.rc_last_rtt; /* We need to add in any internal delay for our timer */ if (bbr->rc_ack_was_delayed) srtt += bbr->r_ctl.rc_ack_hdwr_delay; } else if (rtt_type == BBR_SRTT) { srtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); } else { /* TSNH */ srtt = f_rtt; #ifdef BBR_INVARIANTS panic("Unknown rtt request type %d", rtt_type); #endif } return (srtt); } static int bbr_is_lost(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts) { uint32_t thresh; thresh = bbr_calc_thresh_rack(bbr, bbr_get_rtt(bbr, BBR_RTT_RACK), cts, rsm); if ((cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) >= thresh) { /* It is lost (past time) */ return (1); } return (0); } /* * Return a sendmap if we need to retransmit something. */ static struct bbr_sendmap * bbr_check_recovery_mode(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* * Check to see that we don't need to fall into recovery. 
We will * need to do so if our oldest transmit is past the time we should * have had an ack. */ struct bbr_sendmap *rsm; int32_t idx; if (TAILQ_EMPTY(&bbr->r_ctl.rc_map)) { /* Nothing outstanding that we know of */ return (NULL); } rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if (rsm == NULL) { /* Nothing in the transmit map */ return (NULL); } if (tp->t_flags & TF_SENTFIN) { /* Fin restricted, don't find anything once a fin is sent */ return (NULL); } if (rsm->r_flags & BBR_ACKED) { /* * Ok the first one is acked (this really should not happen * since we remove them from the tmap once they are acked) */ rsm = bbr_find_lowest_rsm(bbr); if (rsm == NULL) return (NULL); } idx = rsm->r_rtr_cnt - 1; if (SEQ_LEQ(cts, rsm->r_tim_lastsent[idx])) { /* Send timestamp is the same or less? Can't be ready */ return (NULL); } /* Get our RTT time */ if (bbr_is_lost(bbr, rsm, cts) && ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || (rsm->r_flags & BBR_SACK_PASSED))) { if ((rsm->r_flags & BBR_MARKED_LOST) == 0) { rsm->r_flags |= BBR_MARKED_LOST; bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; } bbr_cong_signal(tp, NULL, CC_NDUPACK, rsm); #ifdef BBR_INVARIANTS if ((rsm->r_end - rsm->r_start) == 0) panic("tp:%p bbr:%p rsm:%p length is 0?", tp, bbr, rsm); #endif return (rsm); } return (NULL); } /* * RACK Timer, here we simply do logging and housekeeping. * The normal bbr_output_wtime() function will call the * appropriate thing to check if we need to do a RACK retransmit. * We return 1, saying don't proceed with bbr_output_wtime only * when all timers have been stopped (destroyed PCB?). */ static int bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* * This timer simply provides an internal trigger to send out data. * The check_recovery_mode call will see if there are needed * retransmissions, if so we will enter fast-recovery. The output * call may or may not do the same thing depending on sysctl * settings. */ uint32_t lost; if (bbr->rc_all_timers_stopped) { return (1); } if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { /* It's not time yet */ return (0); } BBR_STAT_INC(bbr_to_tot); lost = bbr->r_ctl.rc_lost; if (bbr->r_state && (bbr->r_state != tp->t_state)) bbr_set_state(tp, bbr, 0); bbr_log_to_event(bbr, cts, BBR_TO_FRM_RACK); if (bbr->r_ctl.rc_resend == NULL) { /* Let's do the check here */ bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); } if (bbr_policer_call_from_rack_to) bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost)); bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; return (0); } static __inline void bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start) { int idx; nrsm->r_start = start; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; /* We don't transfer forward the SYN flag */ nrsm->r_flags &= ~BBR_HAS_SYN; /* We move forward the FIN flag, not that this should happen */ rsm->r_flags &= ~BBR_HAS_FIN; nrsm->r_dupack = rsm->r_dupack; nrsm->r_rtr_bytes = 0; nrsm->r_is_gain = rsm->r_is_gain; nrsm->r_is_drain = rsm->r_is_drain; nrsm->r_delivered = rsm->r_delivered; nrsm->r_ts_valid = rsm->r_ts_valid; nrsm->r_del_ack_ts = rsm->r_del_ack_ts; nrsm->r_del_time = rsm->r_del_time; nrsm->r_app_limited = rsm->r_app_limited; nrsm->r_first_sent_time = rsm->r_first_sent_time; nrsm->r_flight_at_send = rsm->r_flight_at_send; /* When we split a piece, the lower section loses any just_ret flag. 
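* After the split, rsm covers [rsm->r_start, start) and nrsm covers [start, old rsm->r_end); the retransmit timestamps are copied below so both halves keep their send history.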
*/ nrsm->r_bbr_state = rsm->r_bbr_state; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } rsm->r_end = nrsm->r_start; idx = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); idx /= 8; /* Check if we got too small */ if ((rsm->r_is_smallmap == 0) && ((rsm->r_end - rsm->r_start) <= idx)) { bbr->r_ctl.rc_num_small_maps_alloced++; rsm->r_is_smallmap = 1; } /* Check the new one as well */ if ((nrsm->r_end - nrsm->r_start) <= idx) { bbr->r_ctl.rc_num_small_maps_alloced++; nrsm->r_is_smallmap = 1; } } static int bbr_sack_mergable(struct bbr_sendmap *at, uint32_t start, uint32_t end) { /* * Given a sack block defined by * start and end, and a current position * at. Return 1 if either side of at * would show that the block is mergeable * to that side. A block to be mergeable * must have overlap with the start/end * and be in the SACK'd state. */ struct bbr_sendmap *l_rsm; struct bbr_sendmap *r_rsm; /* first get the blocks on either side */ l_rsm = TAILQ_PREV(at, bbr_head, r_next); r_rsm = TAILQ_NEXT(at, r_next); if (l_rsm && (l_rsm->r_flags & BBR_ACKED)) { /* Potentially mergeable */ if ((l_rsm->r_end == start) || (SEQ_LT(start, l_rsm->r_end) && SEQ_GT(end, l_rsm->r_end))) { /* * map blk |------| * sack blk |------| * * map blk |------| * sack blk |------| */ return (1); } } if (r_rsm && (r_rsm->r_flags & BBR_ACKED)) { /* Potentially mergeable */ if ((r_rsm->r_start == end) || (SEQ_LT(start, r_rsm->r_start) && SEQ_GT(end, r_rsm->r_start))) { /* * map blk |---------| * sack blk |----| * * map blk |---------| * sack blk |-------| */ return (1); } } return (0); } static struct bbr_sendmap * bbr_merge_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *l_rsm, struct bbr_sendmap *r_rsm) { /* * We are merging two ack'd RSM's, * the l_rsm is on the left (lower seq * values) and the r_rsm is on the right * (higher seq value). The simplest way * to merge these is to move the right * one into the left. I don't think there * is any reason we need to try to find * the oldest (or last oldest retransmitted). */ l_rsm->r_end = r_rsm->r_end; if (l_rsm->r_dupack < r_rsm->r_dupack) l_rsm->r_dupack = r_rsm->r_dupack; if (r_rsm->r_rtr_bytes) l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; if (r_rsm->r_in_tmap) { /* This really should not happen */ TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, r_rsm, r_tnext); } if (r_rsm->r_app_limited) l_rsm->r_app_limited = r_rsm->r_app_limited; /* Now the flags */ if (r_rsm->r_flags & BBR_HAS_FIN) l_rsm->r_flags |= BBR_HAS_FIN; if (r_rsm->r_flags & BBR_TLP) l_rsm->r_flags |= BBR_TLP; if (r_rsm->r_flags & BBR_RWND_COLLAPSED) l_rsm->r_flags |= BBR_RWND_COLLAPSED; if (r_rsm->r_flags & BBR_MARKED_LOST) { /* This really should not happen */ bbr->r_ctl.rc_lost_bytes -= r_rsm->r_end - r_rsm->r_start; } TAILQ_REMOVE(&bbr->r_ctl.rc_map, r_rsm, r_next); if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { /* Transfer the split limit to the map we free */ r_rsm->r_limit_type = l_rsm->r_limit_type; l_rsm->r_limit_type = 0; } bbr_free(bbr, r_rsm); return(l_rsm); } /* * TLP Timer, here we simply setup what segment we want to * have the TLP expire on, the normal bbr_output_wtime() will then * send it out. * * We return 1, saying don't proceed with bbr_output_wtime only * when all timers have been stopped (destroyed PCB?). */ static int bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* * Tail Loss Probe. 
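* The idea: after roughly two RTTs with no feedback, send one probe segment (new data if the window allows, otherwise a retransmission of the tail) to elicit an ACK/SACK, so a lost tail can be repaired by fast recovery instead of waiting out a full RTO.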
*/ struct bbr_sendmap *rsm = NULL; struct socket *so; uint32_t amm; uint32_t out, avail; uint32_t maxseg; int collapsed_win = 0; if (bbr->rc_all_timers_stopped) { return (1); } if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { /* It's not time yet */ return (0); } if (bbr_progress_timeout_check(bbr)) { tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); return (1); } /* Did we somehow get into persists? */ if (bbr->rc_in_persist) { return (0); } if (bbr->r_state && (bbr->r_state != tp->t_state)) bbr_set_state(tp, bbr, 0); BBR_STAT_INC(bbr_tlp_tot); maxseg = tp->t_maxseg - bbr->rc_last_options; #ifdef KERN_TLS if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { /* * For hardware TLS we do *not* want to send * new data. */ goto need_retran; } #endif /* * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. */ so = tp->t_inpcb->inp_socket; avail = sbavail(&so->so_snd); out = ctf_outstanding(tp); if (out > tp->snd_wnd) { /* special case, we need a retransmission */ collapsed_win = 1; goto need_retran; } if (avail > out) { /* New data is available */ amm = avail - out; if (amm > maxseg) { amm = maxseg; } else if ((amm < maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { /* not enough to fill an MTU and no-delay is off */ goto need_retran; } /* Set the send-new override */ if ((out + amm) <= tp->snd_wnd) { bbr->rc_tlp_new_data = 1; } else { goto need_retran; } bbr->r_ctl.rc_tlp_seg_send_cnt = 0; bbr->r_ctl.rc_last_tlp_seq = tp->snd_max; bbr->r_ctl.rc_tlp_send = NULL; /* cap any slots */ BBR_STAT_INC(bbr_tlp_newdata); goto send; } need_retran: /* * Ok we need to arrange the last un-acked segment to be re-sent, or * optionally the first un-acked segment. */ if (collapsed_win == 0) { rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); if (rsm && (rsm->r_flags & (BBR_ACKED | BBR_HAS_FIN))) { rsm = bbr_find_high_nonack(bbr, rsm); } if (rsm == NULL) { goto restore; } } else { /* * We must find the last segment * that was acceptable to the client. */ TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { if ((rsm->r_flags & BBR_RWND_COLLAPSED) == 0) { /* Found one */ break; } } if (rsm == NULL) { /* None? If so send the first */ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); if (rsm == NULL) goto restore; } } if ((rsm->r_end - rsm->r_start) > maxseg) { /* * We need to split the last segment in two. */ struct bbr_sendmap *nrsm; nrsm = bbr_alloc_full_limit(bbr); if (nrsm == NULL) { /* * We can't get memory for the split; we can either just * not split it, or retransmit the whole piece. Let's * do the large send (BTLP :-) ). */ goto go_for_it; } bbr_clone_rsm(bbr, nrsm, rsm, (rsm->r_end - maxseg)); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~BBR_HAS_FIN); rsm = nrsm; } go_for_it: bbr->r_ctl.rc_tlp_send = rsm; bbr->rc_tlp_rtx_out = 1; if (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) { bbr->r_ctl.rc_tlp_seg_send_cnt++; tp->t_rxtshift++; } else { bbr->r_ctl.rc_last_tlp_seq = rsm->r_start; bbr->r_ctl.rc_tlp_seg_send_cnt = 1; } send: if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) { /* * Can't [re]/transmit a segment we have retransmitted the * max times. We need the retransmit timer to take over. 
restore: bbr->rc_tlp_new_data = 0; bbr->r_ctl.rc_tlp_send = NULL; if (rsm) rsm->r_flags &= ~BBR_TLP; BBR_STAT_INC(bbr_tlp_retran_fail); return (0); } else if (rsm) { rsm->r_flags |= BBR_TLP; } if (rsm && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) && (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend)) { /* * We have retransmitted too many times for TLP. Switch to * the regular RTO timer */ goto restore; } bbr_log_to_event(bbr, cts, BBR_TO_FRM_TLP); bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } /* * Delayed ack Timer, here we simply need to set up the * ACK_NOW flag and remove the DELACK flag. From there * the output routine will send the ack out. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { if (bbr->rc_all_timers_stopped) { return (1); } bbr_log_to_event(bbr, cts, BBR_TO_FRM_DELACK); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; return (0); } /* * Persist timer, here we simply need to set up the * FORCE-DATA flag; the output routine will send * the one byte probe. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { struct tcptemp *t_template; int32_t retval = 1; if (bbr->rc_all_timers_stopped) { return (1); } if (bbr->rc_in_persist == 0) return (0); KASSERT(tp->t_inpcb != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); /* * Persistence timer into zero window. Force a byte to be output, if * possible. */ bbr_log_to_event(bbr, cts, BBR_TO_FRM_PERSIST); bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; TCPSTAT_INC(tcps_persisttimeo); /* * Have we exceeded the user-specified progress time? */ if (bbr_progress_timeout_check(bbr)) { tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } /* * Hack: if the peer is dead/unreachable, we do not time out if the * window is closed. After a full backoff, drop the connection if * the idle time (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) && tp->snd_una == tp->snd_max) { bbr_exit_persist(tp, bbr, cts, __LINE__); retval = 0; goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } t_template = tcpip_maketemplate(bbr->rc_inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); /* This sends an ack */ if (tp->t_flags & TF_DELACK) tp->t_flags &= ~TF_DELACK; free(t_template, M_TEMP); } if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0); out: return (retval); } /* * If a keepalive goes off, we had no other timers * happening. We always return 1 here since this * routine either drops the connection or sends * out a segment in response. (A sketch of the * keepalive idle check follows.)
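 */
#if 0
/*
 * Illustrative only (hypothetical helper, not compiled): the keepalive
 * drop test below boils down to a simple idle-time comparison in
 * ticks; keepidle covers the quiet period, maxidle the probe window.
 */
static int
keepalive_idle_exceeded(int ticks_now, int t_rcvtime, int keepidle, int maxidle)
{
	/* Drop once idle for keepidle plus the whole probe window. */
	return (ticks_now - t_rcvtime >= keepidle + maxidle);
}
#endif
/*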
*/ static int bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { struct tcptemp *t_template; struct inpcb *inp; if (bbr->rc_all_timers_stopped) { return (1); } bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; bbr_log_to_event(bbr, cts, BBR_TO_FRM_KEEP); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response if the peer is * up and reachable: either an ACK if the connection is * still alive, or an RST if the peer has closed the * connection due to timeout or reboot. Using sequence * number tp->snd_una-1 causes the transmitted zero-length * segment to lie outside the receive window; by the * protocol spec, this requires the correspondent TCP to * respond. */ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } } bbr_start_hpts_timer(bbr, tp, cts, 4, 0, 0); return (1); dropit: TCPSTAT_INC(tcps_keepdrops); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); return (1); } /* * Retransmit helper function, clear up all the ack * flags and take care of important book keeping. */ static void bbr_remxt_tmr(struct tcpcb *tp) { /* * The retransmit timer went off, all sack'd blocks must be * un-acked. */ struct bbr_sendmap *rsm, *trsm = NULL; struct tcp_bbr *bbr; uint32_t cts, lost; bbr = (struct tcp_bbr *)tp->t_fb_ptr; cts = tcp_get_usecs(&bbr->rc_tv); lost = bbr->r_ctl.rc_lost; if (bbr->r_state && (bbr->r_state != tp->t_state)) bbr_set_state(tp, bbr, 0); TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { if (rsm->r_flags & BBR_ACKED) { uint32_t old_flags; rsm->r_dupack = 0; if (rsm->r_in_tmap == 0) { /* We must re-add it back to the tlist */ if (trsm == NULL) { TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext); } else { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, trsm, rsm, r_tnext); } rsm->r_in_tmap = 1; } old_flags = rsm->r_flags; rsm->r_flags |= BBR_RXT_CLEARED; rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS); bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__); } else { if ((rsm->r_flags & BBR_MARKED_LOST) == 0) { bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; } if (bbr_marks_rxt_sack_passed) { /* * With this option, we will rack out * in 1ms increments the rest of the packets. */ rsm->r_flags |= BBR_SACK_PASSED | BBR_MARKED_LOST; rsm->r_flags &= ~BBR_WAS_SACKPASS; } else { /* * With this option we only mark them lost * and remove all sack'd markings. We will run * another RXT or a TLP. This will cause * us to eventually send more based on what * ack's come in. 
*/ rsm->r_flags |= BBR_MARKED_LOST; rsm->r_flags &= ~BBR_WAS_SACKPASS; rsm->r_flags &= ~BBR_SACK_PASSED; } } trsm = rsm; } bbr->r_ctl.rc_resend = TAILQ_FIRST(&bbr->r_ctl.rc_map); /* Clear the count (we just un-acked them) */ bbr_log_to_event(bbr, cts, BBR_TO_FRM_TMR); bbr->rc_tlp_new_data = 0; bbr->r_ctl.rc_tlp_seg_send_cnt = 0; /* zap the behindness on a rxt */ bbr->r_ctl.rc_hptsi_agg_delay = 0; bbr->r_agg_early_set = 0; bbr->r_ctl.rc_agg_early = 0; bbr->rc_tlp_rtx_out = 0; bbr->r_ctl.rc_sacked = 0; bbr->r_ctl.rc_sacklast = NULL; bbr->r_timer_override = 1; bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost)); } /* * Re-transmit timeout! If we drop the PCB we will return 1, otherwise * we will setup to retransmit the lowest seq number outstanding. */ static int bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { int32_t rexmt; int32_t retval = 0; bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; if (bbr->rc_all_timers_stopped) { return (1); } if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_una == tp->snd_max)) { /* Nothing outstanding .. nothing to do */ return (0); } /* * Retransmission timer went off. Message has not been acked within * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ if (bbr_progress_timeout_check(bbr)) { retval = 1; tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } bbr_remxt_tmr(tp); if ((bbr->r_ctl.rc_resend == NULL) || ((bbr->r_ctl.rc_resend->r_flags & BBR_RWND_COLLAPSED) == 0)) { /* * If the rwnd collapsed on * the one we are retransmitting * it does not count against the * rxt count. */ tp->t_rxtshift++; } if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); retval = 1; tcp_set_inp_to_drop(bbr->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; } if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be limited * to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can be * recovered if this turns out to be a "bad" retransmit. A * retransmit is considered "bad" if an ACK for this segment * is received within RTT/2 interval; the assumption here is * that the ACK was already in flight. See "On Estimating * End-to-End Network Path Properties" by Allman and Paxson * for more details. */ tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; if (!IN_RECOVERY(tp->t_flags)) { tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else { tp->t_flags &= ~TF_PREVVALID; } tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; } else { tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; tp->t_flags &= ~TF_PREVVALID; } TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = USEC_2_TICKS(BBR_INITIAL_RTO) * tcp_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000)); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple * of packets and process straight to FIN. 
In that case we won't * catch ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int32_t isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, * 1448 -> 1188 -> 524) should be given 2 chances to recover * before further clamping down. 'tp->t_rxtshift % 2 == 0' * should take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: - * Disable Path MTU Discovery (IP "DF" bit). - * Reduce MTU to lower value than what we negotiated * with peer. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* * Record that we may have found a black * hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the * default in an attempt to retransmit. */ #ifdef INET6 isipv6 = bbr->r_is_v6; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole * and we restore the previous MSS and blackhole * detection flags. The limit '6' is determined by * giving each probe stage (1448, 1188, 524) 2 * chances to recover. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; TCPSTAT_INC(tcps_pmtud_blackhole_failed); } } } /* * Disable RFC1323 and SACK if we haven't got any response to our * third SYN to work-around some broken terminal servers (most of * which have hopefully been retired) that have bad VJ header * compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current retransmit * times until then. 
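 *
 * (A small unscaled illustration of that trade follows.)
 */
#if 0
/*
 * Hypothetical helper, not compiled: zeroing srtt while folding a
 * compensating share into rttvar keeps rto = srtt + 4*rttvar unchanged
 * (4*(rttvar + srtt/4) == 4*rttvar + srtt, ignoring integer rounding),
 * so retransmit timing survives until a fresh sample reseeds srtt.
 * Plain integers here; the kernel fields are fixed-point scaled.
 */
static int
rto_after_clobber(int srtt, int rttvar)
{
	rttvar += srtt / 4;	/* absorb srtt into the variance term */
	srtt = 0;		/* next measurement becomes the new srtt */
	return (srtt + 4 * rttvar);
}
#endif
/*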
*/ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if (bbr->r_is_v6) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; out: return (retval); } static int bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t hpts_calling) { int32_t ret = 0; int32_t timers = (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK); if (timers == 0) { return (0); } if (tp->t_state == TCPS_LISTEN) { /* no timers on listen sockets */ if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) return (0); return (1); } if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { uint32_t left; if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { ret = -1; bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling); return (0); } if (hpts_calling == 0) { ret = -2; bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling); return (0); } /* * Ok our timer went off early and we are not paced false * alarm, go back to sleep. */ left = bbr->r_ctl.rc_timer_exp - cts; ret = -3; bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(left)); return (1); } bbr->rc_tmr_stopped = 0; bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; if (timers & PACE_TMR_DELACK) { ret = bbr_timeout_delack(tp, bbr, cts); } else if (timers & PACE_TMR_PERSIT) { ret = bbr_timeout_persist(tp, bbr, cts); } else if (timers & PACE_TMR_RACK) { bbr->r_ctl.rc_tlp_rxt_last_time = cts; ret = bbr_timeout_rack(tp, bbr, cts); } else if (timers & PACE_TMR_TLP) { bbr->r_ctl.rc_tlp_rxt_last_time = cts; ret = bbr_timeout_tlp(tp, bbr, cts); } else if (timers & PACE_TMR_RXT) { bbr->r_ctl.rc_tlp_rxt_last_time = cts; ret = bbr_timeout_rxt(tp, bbr, cts); } else if (timers & PACE_TMR_KEEP) { ret = bbr_timeout_keepalive(tp, bbr, cts); } bbr_log_to_processing(bbr, cts, ret, timers, hpts_calling); return (ret); } static void bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) { if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { uint8_t hpts_removed = 0; if (bbr->rc_inp->inp_in_hpts && (bbr->rc_timer_first == 1)) { /* * If we are canceling timer's when we have the * timer ahead of the output being paced. We also * must remove ourselves from the hpts. */ hpts_removed = 1; tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); if (bbr->r_ctl.rc_last_delay_val) { /* Update the last hptsi delay too */ uint32_t time_since_send; if (TSTMP_GT(cts, bbr->rc_pacer_started)) time_since_send = cts - bbr->rc_pacer_started; else time_since_send = 0; if (bbr->r_ctl.rc_last_delay_val > time_since_send) { /* Cut down our slot time */ bbr->r_ctl.rc_last_delay_val -= time_since_send; } else { bbr->r_ctl.rc_last_delay_val = 0; } bbr->rc_pacer_started = cts; } } bbr->rc_timer_first = 0; bbr_log_to_cancel(bbr, line, cts, hpts_removed); bbr->rc_tmr_stopped = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK; bbr->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); } } static void bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; bbr->rc_all_timers_stopped = 1; return; } /* * stop all timers always returning 0. */ static int bbr_stopall(struct tcpcb *tp) { return (0); } static void bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) { return; } /* * return true if a bbr timer (rack or tlp) is active. 
*/ static int bbr_timer_active(struct tcpcb *tp, uint32_t timer_type) { return (0); } static uint32_t bbr_get_earliest_send_outstanding(struct tcp_bbr *bbr, struct bbr_sendmap *u_rsm, uint32_t cts) { struct bbr_sendmap *rsm; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if ((rsm == NULL) || (u_rsm == rsm)) return (cts); return (rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); } static void bbr_update_rsm(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts, uint32_t pacing_time) { int32_t idx; rsm->r_rtr_cnt++; rsm->r_dupack = 0; if (rsm->r_rtr_cnt > BBR_NUM_OF_RETRANS) { rsm->r_rtr_cnt = BBR_NUM_OF_RETRANS; rsm->r_flags |= BBR_OVERMAX; } if (rsm->r_flags & BBR_RWND_COLLAPSED) { /* Take off the collapsed flag at rxt */ rsm->r_flags &= ~BBR_RWND_COLLAPSED; } if (rsm->r_flags & BBR_MARKED_LOST) { /* We have retransmitted, it's no longer lost */ rsm->r_flags &= ~BBR_MARKED_LOST; bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; } if (rsm->r_flags & BBR_RXT_CLEARED) { /* * We hit an RXT timer on it and * we cleared the "acked" flag. * We now have it going back into * flight, we can remove the cleared * flag and possibly do accounting on * this piece. */ rsm->r_flags &= ~BBR_RXT_CLEARED; } if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & BBR_TLP) == 0)) { bbr->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = cts; rsm->r_pacing_delay = pacing_time; rsm->r_delivered = bbr->r_ctl.rc_delivered; rsm->r_ts_valid = bbr->rc_ts_valid; if (bbr->rc_ts_valid) rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts; if (bbr->r_ctl.r_app_limited_until) rsm->r_app_limited = 1; else rsm->r_app_limited = 0; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) rsm->r_bbr_state = bbr_state_val(bbr); else rsm->r_bbr_state = 8; if (rsm->r_flags & BBR_ACKED) { /* Probably MTU discovery messing with us */ uint32_t old_flags; old_flags = rsm->r_flags; rsm->r_flags &= ~BBR_ACKED; bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__); bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); if (bbr->r_ctl.rc_sacked == 0) bbr->r_ctl.rc_sacklast = NULL; } if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); } TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (rsm->r_flags & BBR_SACK_PASSED) { /* We have retransmitted due to the SACK pass */ rsm->r_flags &= ~BBR_SACK_PASSED; rsm->r_flags |= BBR_WAS_SACKPASS; } rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts); rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) { rsm->r_is_gain = 1; rsm->r_is_drain = 0; } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) { rsm->r_is_drain = 1; rsm->r_is_gain = 0; } else { rsm->r_is_drain = 0; rsm->r_is_gain = 0; } rsm->r_del_time = bbr->r_ctl.rc_del_time; /* TEMP GOOGLE CODE */ } /* * Returns 0, or the sequence where we stopped * updating. We also update the lenp to be the amount * of data left. */ static uint32_t bbr_update_entry(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts, int32_t *lenp, uint32_t pacing_time) { /* * We (re-)transmitted starting at rsm->r_start for some length * (possibly less than r_end).
*/ struct bbr_sendmap *nrsm; uint32_t c_end; int32_t len; len = *lenp; c_end = rsm->r_start + len; if (SEQ_GEQ(c_end, rsm->r_end)) { /* * We retransmitted the whole piece or more than the whole * slopping into the next rsm. */ bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); if (c_end == rsm->r_end) { *lenp = 0; return (0); } else { int32_t act_len; /* Hangs over the end return whats left */ act_len = rsm->r_end - rsm->r_start; *lenp = (len - act_len); return (rsm->r_end); } /* We don't get out of this block. */ } /* * Here we retransmitted less than the whole thing which means we * have to split this into what was transmitted and what was not. */ nrsm = bbr_alloc_full_limit(bbr); if (nrsm == NULL) { *lenp = 0; return (0); } /* * So here we are going to take the original rsm and make it what we * retransmitted. nrsm will be the tail portion we did not * retransmit. For example say the chunk was 1, 11 (10 bytes). And * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to * 1, 6 and the new piece will be 6, 11. */ bbr_clone_rsm(bbr, nrsm, rsm, c_end); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); nrsm->r_dupack = 0; if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~BBR_HAS_FIN); bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); *lenp = 0; return (0); } static uint64_t bbr_get_hardware_rate(struct tcp_bbr *bbr) { uint64_t bw; bw = bbr_get_bw(bbr); bw *= (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]; bw /= (uint64_t)BBR_UNIT; return(bw); } static void bbr_setup_less_of_rate(struct tcp_bbr *bbr, uint32_t cts, uint64_t act_rate, uint64_t rate_wanted) { /* * We could not get a full gains worth * of rate. */ if (get_filter_value(&bbr->r_ctl.rc_delrate) >= act_rate) { /* we can't even get the real rate */ uint64_t red; bbr->skip_gain = 1; bbr->gain_is_limited = 0; red = get_filter_value(&bbr->r_ctl.rc_delrate) - act_rate; if (red) filter_reduce_by(&bbr->r_ctl.rc_delrate, red, cts); } else { /* We can use a lower gain */ bbr->skip_gain = 0; bbr->gain_is_limited = 1; } } static void bbr_update_hardware_pacing_rate(struct tcp_bbr *bbr, uint32_t cts) { const struct tcp_hwrate_limit_table *nrte; int error, rate = -1; if (bbr->r_ctl.crte == NULL) return; if ((bbr->rc_inp->inp_route.ro_rt == NULL) || (bbr->rc_inp->inp_route.ro_rt->rt_ifp == NULL)) { /* Lost our routes? */ /* Clear the way for a re-attempt */ bbr->bbr_attempt_hdwr_pace = 0; lost_rate: bbr->gain_is_limited = 0; bbr->skip_gain = 0; bbr->bbr_hdrw_pacing = 0; counter_u64_add(bbr_flows_whdwr_pacing, -1); counter_u64_add(bbr_flows_nohdwr_pacing, 1); tcp_bbr_tso_size_check(bbr, cts); return; } rate = bbr_get_hardware_rate(bbr); nrte = tcp_chg_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp, bbr->rc_inp->inp_route.ro_rt->rt_ifp, rate, (RS_PACING_GEQ|RS_PACING_SUB_OK), &error); if (nrte == NULL) { goto lost_rate; } if (nrte != bbr->r_ctl.crte) { bbr->r_ctl.crte = nrte; if (error == 0) { BBR_STAT_INC(bbr_hdwr_rl_mod_ok); if (bbr->r_ctl.crte->rate < rate) { /* We have a problem */ bbr_setup_less_of_rate(bbr, cts, bbr->r_ctl.crte->rate, rate); } else { /* We are good */ bbr->gain_is_limited = 0; bbr->skip_gain = 0; } } else { /* A failure should release the tag */ BBR_STAT_INC(bbr_hdwr_rl_mod_fail); bbr->gain_is_limited = 0; bbr->skip_gain = 0; bbr->bbr_hdrw_pacing = 0; } bbr_type_log_hdwr_pacing(bbr, bbr->r_ctl.crte->ptbl->rs_ifp, rate, ((bbr->r_ctl.crte == NULL) ? 
0 : bbr->r_ctl.crte->rate), __LINE__, cts, error); } } static void bbr_adjust_for_hw_pacing(struct tcp_bbr *bbr, uint32_t cts) { /* * If we have hardware pacing support * we need to factor that in for our * TSO size. */ const struct tcp_hwrate_limit_table *rlp; uint32_t cur_delay, seg_sz, maxseg, new_tso, delta, hdwr_delay; if ((bbr->bbr_hdrw_pacing == 0) || (IN_RECOVERY(bbr->rc_tp->t_flags)) || (bbr->r_ctl.crte == NULL)) return; if (bbr->hw_pacing_set == 0) { /* Not yet by the hdwr pacing count delay */ return; } if (bbr_hdwr_pace_adjust == 0) { /* No adjustment */ return; } rlp = bbr->r_ctl.crte; if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; else maxseg = BBR_MIN_SEG - bbr->rc_last_options; /* * So lets first get the * time we will take between * TSO sized sends currently without * hardware help. */ cur_delay = bbr_get_pacing_delay(bbr, BBR_UNIT, bbr->r_ctl.rc_pace_max_segs, cts, 1); hdwr_delay = bbr->r_ctl.rc_pace_max_segs / maxseg; hdwr_delay *= rlp->time_between; if (cur_delay > hdwr_delay) delta = cur_delay - hdwr_delay; else delta = 0; bbr_log_type_tsosize(bbr, cts, delta, cur_delay, hdwr_delay, (bbr->r_ctl.rc_pace_max_segs / maxseg), 1); if (delta && (delta < (max(rlp->time_between, bbr->r_ctl.bbr_hptsi_segments_delay_tar)))) { /* * Now lets divide by the pacing * time between each segment the * hardware sends rounding up and * derive a bytes from that. We multiply * that by bbr_hdwr_pace_adjust to get * more bang for our buck. * * The goal is to have the software pacer * waiting no more than an additional * pacing delay if we can (without the * compensation i.e. x bbr_hdwr_pace_adjust). */ seg_sz = max(((cur_delay + rlp->time_between)/rlp->time_between), (bbr->r_ctl.rc_pace_max_segs/maxseg)); seg_sz *= bbr_hdwr_pace_adjust; if (bbr_hdwr_pace_floor && (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) { /* Currently hardware paces * out rs_min_seg segments at a time. * We need to make sure we always send at least * a full burst of bbr_hdwr_pace_floor down. */ seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg; } seg_sz *= maxseg; } else if (delta == 0) { /* * The highest pacing rate is * above our b/w gained. This means * we probably are going quite fast at * the hardware highest rate. Lets just multiply * the calculated TSO size by the * multiplier factor (its probably * 4 segments in the default config for * mlx). */ seg_sz = bbr->r_ctl.rc_pace_max_segs * bbr_hdwr_pace_adjust; if (bbr_hdwr_pace_floor && (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) { /* Currently hardware paces * out rs_min_seg segments at a time. * We need to make sure we always send at least * a full burst of bbr_hdwr_pace_floor down. */ seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg; } } else { /* * The pacing time difference is so * big that the hardware will * pace out more rapidly then we * really want and then we * will have a long delay. Lets just keep * the same TSO size so its as if * we were not using hdwr pacing (we * just gain a bit of spacing from the * hardware if seg_sz > 1). 
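 */
#if 0
/*
 * Illustrative only (hypothetical helper, not compiled): the gap this
 * function measures between the software pacer's inter-burst delay and
 * the time the NIC itself needs to clock out one TSO burst of tso_segs
 * segments at time_between microseconds per segment.
 */
static uint32_t
hw_pacing_slack(uint32_t cur_delay, uint32_t time_between, uint32_t tso_segs)
{
	uint32_t hdwr_delay = tso_segs * time_between;

	return (cur_delay > hdwr_delay ? cur_delay - hdwr_delay : 0);
}
#endif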
seg_sz = bbr->r_ctl.rc_pace_max_segs; } if (seg_sz > bbr->r_ctl.rc_pace_max_segs) new_tso = seg_sz; else new_tso = bbr->r_ctl.rc_pace_max_segs; if (new_tso >= (PACE_MAX_IP_BYTES-maxseg)) new_tso = PACE_MAX_IP_BYTES - maxseg; if (new_tso != bbr->r_ctl.rc_pace_max_segs) { bbr_log_type_tsosize(bbr, cts, new_tso, 0, bbr->r_ctl.rc_pace_max_segs, maxseg, 0); bbr->r_ctl.rc_pace_max_segs = new_tso; } } static void tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts) { uint64_t bw; uint32_t old_tso = 0, new_tso; uint32_t maxseg, bytes; uint32_t tls_seg = 0; /* * Google/linux uses the following algorithm to determine * the TSO size based on the b/w of the link (from Neal Cardwell email 9/27/18): * * bytes = bw_in_bytes_per_second / 1000 * bytes = min(bytes, 64k) * tso_segs = bytes / MSS * if (bw < 1.2Mbs) * min_tso_segs = 1 * else * min_tso_segs = 2 * tso_segs = max(tso_segs, min_tso_segs) * * Note: apply a device-specific limit (we apply this in * tcp_m_copym). * Note that before the initial measurement is made google bursts out * a full iwnd just like new-reno/cubic. * * We do not use this algorithm. Instead we * use a two-phased approach: * * if ( bw <= per-tcb-cross-over) * goal_tso = calculate how much with this bw we * can send in goal-time seconds. * if (goal_tso > mss) * seg = goal_tso / mss * tso = seg * mss * else * tso = mss * if (tso > per-tcb-max) * tso = per-tcb-max * else if ( bw > 512Mbps) * tso = max-tso (64k/mss) * else * goal_tso = bw / per-tcb-divisor * seg = (goal_tso + mss-1)/mss * tso = seg * mss * * if (tso < per-tcb-floor) * tso = per-tcb-floor * if (tso > per-tcb-utter_max) * tso = per-tcb-utter_max * * Note the default per-tcb-divisor is 1000 (same as Google). * The goal cross-over is 30Mbps however. To recreate Google's * algorithm you need to set: * * cross-over = 23,168,000 bps * goal-time = 18000 * per-tcb-max = 2 * per-tcb-divisor = 1000 * per-tcb-floor = 1 * * This will get you "google bbr" behavior with respect to tso size. * * Note we do not set any TSO size until we are past the initial * window. Before that we generally use either a single MSS * or we use the full IW size (so we burst an IW at a time). * Also note that Hardware-TLS is special and does alternate * things to minimize PCI Bus Bandwidth use. */ if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) { maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; } else { maxseg = BBR_MIN_SEG - bbr->rc_last_options; } #ifdef KERN_TLS if (bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { tls_seg = ctf_get_opt_tls_size(bbr->rc_inp->inp_socket, bbr->rc_tp->snd_wnd); bbr->r_ctl.rc_pace_min_segs = (tls_seg + bbr->rc_last_options); } #endif old_tso = bbr->r_ctl.rc_pace_max_segs; if (bbr->rc_past_init_win == 0) { /* * Not enough data has been acknowledged to make a * judgement unless we are hardware TLS. Set up * the initial TSO based on if we are sending a * full IW at once or not. */ if (bbr->rc_use_google) bbr->r_ctl.rc_pace_max_segs = ((bbr->rc_tp->t_maxseg - bbr->rc_last_options) * 2); else if (bbr->bbr_init_win_cheat) bbr->r_ctl.rc_pace_max_segs = bbr_initial_cwnd(bbr, bbr->rc_tp); else bbr->r_ctl.rc_pace_max_segs = bbr->rc_tp->t_maxseg - bbr->rc_last_options; if (bbr->r_ctl.rc_pace_min_segs != bbr->rc_tp->t_maxseg) bbr->r_ctl.rc_pace_min_segs = bbr->rc_tp->t_maxseg; #ifdef KERN_TLS if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) && tls_seg) { /* * For hardware TLS we set our min to the tls_seg size.
*/ bbr->r_ctl.rc_pace_max_segs = tls_seg; bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options; } #endif if (bbr->r_ctl.rc_pace_max_segs == 0) { bbr->r_ctl.rc_pace_max_segs = maxseg; } bbr_log_type_tsosize(bbr, cts, bbr->r_ctl.rc_pace_max_segs, tls_seg, old_tso, maxseg, 0); #ifdef KERN_TLS if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0) #endif bbr_adjust_for_hw_pacing(bbr, cts); return; } /** * Now lets set the TSO goal based on our delivery rate in * bytes per second. Note we only do this if * we have acked at least the initial cwnd worth of data. */ bw = bbr_get_bw(bbr); if (IN_RECOVERY(bbr->rc_tp->t_flags) && (bbr->rc_use_google == 0)) { /* We clamp to one MSS in recovery */ new_tso = maxseg; } else if (bbr->rc_use_google) { int min_tso_segs; /* Google considers the gain too */ if (bbr->r_ctl.rc_bbr_hptsi_gain != BBR_UNIT) { bw *= bbr->r_ctl.rc_bbr_hptsi_gain; bw /= BBR_UNIT; } bytes = bw / 1024; if (bytes > (64 * 1024)) bytes = 64 * 1024; new_tso = bytes / maxseg; if (bw < ONE_POINT_TWO_MEG) min_tso_segs = 1; else min_tso_segs = 2; if (new_tso < min_tso_segs) new_tso = min_tso_segs; new_tso *= maxseg; } else if (bbr->rc_no_pacing) { new_tso = (PACE_MAX_IP_BYTES / maxseg) * maxseg; } else if (bw <= bbr->r_ctl.bbr_cross_over) { /* * Calculate the worse case b/w TSO if we are inserting no * more than a delay_target number of TSO's. */ uint32_t tso_len, min_tso; tso_len = bbr_get_pacing_length(bbr, BBR_UNIT, bbr->r_ctl.bbr_hptsi_segments_delay_tar, bw); if (tso_len > maxseg) { new_tso = tso_len / maxseg; if (new_tso > bbr->r_ctl.bbr_hptsi_segments_max) new_tso = bbr->r_ctl.bbr_hptsi_segments_max; new_tso *= maxseg; } else { /* * less than a full sized frame yikes.. long rtt or * low bw? */ min_tso = bbr_minseg(bbr); if ((tso_len > min_tso) && (bbr_all_get_min == 0)) new_tso = rounddown(tso_len, min_tso); else new_tso = min_tso; } } else if (bw > FIVETWELVE_MBPS) { /* * This guy is so fast b/w wise that we can TSO as large as * possible of segments that the NIC will allow. */ new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg); } else { /* * This formula is based on attempting to send a segment or * more every bbr_hptsi_per_second. The default is 1000 * which means you are targeting what you can send every 1ms * based on the peers bw. * * If the number drops to say 500, then you are looking more * at 2ms and you will raise how much we send in a single * TSO thus saving CPU (less bbr_output_wtime() calls). The * trade off of course is you will send more at once and * thus tend to clump up the sends into larger "bursts" * building a queue. */ bw /= bbr->r_ctl.bbr_hptsi_per_second; new_tso = roundup(bw, (uint64_t)maxseg); /* * Gate the floor to match what our lower than 48Mbps * algorithm does. The ceiling (bbr_hptsi_segments_max) thus * becomes the floor for this calculation. 
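 * (A sketch of this bytes-per-interval rounding follows.)
 */
#if 0
/*
 * Hypothetical helper, not compiled: the high-bandwidth branch above in
 * miniature. Divide the rate by the per-second target to get bytes per
 * pacing interval, then round up to a whole number of segments
 * (cf. roundup()).
 */
static uint64_t
tso_from_rate(uint64_t bw, uint32_t per_second, uint32_t maxseg)
{
	uint64_t goal = bw / per_second;

	return (((goal + maxseg - 1) / maxseg) * maxseg);
}
#endif
/*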
*/ if (new_tso < (bbr->r_ctl.bbr_hptsi_segments_max * maxseg)) new_tso = (bbr->r_ctl.bbr_hptsi_segments_max * maxseg); } if (bbr->r_ctl.bbr_hptsi_segments_floor && (new_tso < (maxseg * bbr->r_ctl.bbr_hptsi_segments_floor))) new_tso = maxseg * bbr->r_ctl.bbr_hptsi_segments_floor; if (new_tso > PACE_MAX_IP_BYTES) new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg); /* Enforce an utter maximum if we are not HW-TLS */ #ifdef KERN_TLS if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) == 0) #endif if (bbr->r_ctl.bbr_utter_max && (new_tso > (bbr->r_ctl.bbr_utter_max * maxseg))) { new_tso = bbr->r_ctl.bbr_utter_max * maxseg; } #ifdef KERN_TLS if (tls_seg) { /* * Lets move the output size * up to 1 or more TLS record sizes. */ uint32_t temp; temp = roundup(new_tso, tls_seg); new_tso = temp; /* Back down if needed to under a full frame */ while (new_tso > PACE_MAX_IP_BYTES) new_tso -= tls_seg; } #endif if (old_tso != new_tso) { /* Only log changes */ bbr_log_type_tsosize(bbr, cts, new_tso, tls_seg, old_tso, maxseg, 0); bbr->r_ctl.rc_pace_max_segs = new_tso; } #ifdef KERN_TLS if ((bbr->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) && tls_seg) { bbr->r_ctl.rc_pace_min_segs = tls_seg + bbr->rc_last_options; } else #endif /* We have hardware pacing and not hardware TLS! */ bbr_adjust_for_hw_pacing(bbr, cts); } static void bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t cts, struct mbuf *mb, int32_t * abandon, struct bbr_sendmap *hintrsm, uint32_t delay_calc, struct sockbuf *sb) { struct bbr_sendmap *rsm, *nrsm; register uint32_t snd_max, snd_una; uint32_t pacing_time; /* * Add to the RACK log of packets in flight or retransmitted. If * there is a TS option we will use the TS echoed, if not we will * grab a TS. * * Retransmissions will increment the count and move the ts to its * proper place. Note that if options do not include TS's then we * won't be able to effectively use the ACK for an RTT on a retran. * * Notes about r_start and r_end. Lets consider a send starting at * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next * slot (11). * */ INP_WLOCK_ASSERT(tp->t_inpcb); if (err) { /* * We don't log errors -- we could but snd_max does not * advance in this case either. */ return; } if (th_flags & TH_RST) { /* * We don't log resets and we return immediately from * sending */ *abandon = 1; return; } snd_una = tp->snd_una; if (th_flags & (TH_SYN | TH_FIN) && (hintrsm == NULL)) { /* * The call to bbr_log_output is made before bumping * snd_max. This means we can record one extra byte on a SYN * or FIN if seq_out is adding more on and a FIN is present * (and we are not resending). */ if (th_flags & TH_SYN) len++; if (th_flags & TH_FIN) len++; } if (SEQ_LEQ((seq_out + len), snd_una)) { /* Are sending an old segment to induce an ack (keep-alive)? */ return; } if (SEQ_LT(seq_out, snd_una)) { /* huh? should we panic? */ uint32_t end; end = seq_out + len; seq_out = snd_una; len = end - seq_out; } snd_max = tp->snd_max; if (len == 0) { /* We don't log zero window probes */ return; } pacing_time = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, len, cts, 1); /* First question is it a retransmission? 
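 */
#if 0
/*
 * Hypothetical helper, not compiled: the half-open [r_start, r_end)
 * convention described in the notes above. A send of 10 bytes starting
 * at sequence 1 gives r_start == 1 and r_end == 11; sequence 11 is not
 * contained, it is the first byte of the next block. SEQ_GEQ/SEQ_LT
 * are the usual wrap-safe comparisons from tcp_seq.h.
 */
static int
rsm_contains(uint32_t r_start, uint32_t r_end, uint32_t seq)
{
	return (SEQ_GEQ(seq, r_start) && SEQ_LT(seq, r_end));
}
#endif
/*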
*/ if (seq_out == snd_max) { again: rsm = bbr_alloc(bbr); if (rsm == NULL) { return; } rsm->r_flags = 0; if (th_flags & TH_SYN) rsm->r_flags |= BBR_HAS_SYN; if (th_flags & TH_FIN) rsm->r_flags |= BBR_HAS_FIN; rsm->r_tim_lastsent[0] = cts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = seq_out; rsm->r_end = rsm->r_start + len; rsm->r_dupack = 0; rsm->r_delivered = bbr->r_ctl.rc_delivered; rsm->r_pacing_delay = pacing_time; rsm->r_ts_valid = bbr->rc_ts_valid; if (bbr->rc_ts_valid) rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts; rsm->r_del_time = bbr->r_ctl.rc_del_time; if (bbr->r_ctl.r_app_limited_until) rsm->r_app_limited = 1; else rsm->r_app_limited = 0; rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts); rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); /* * Here we must also add in this rsm since snd_max * is updated after we return from a new send. */ rsm->r_flight_at_send += len; TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) rsm->r_bbr_state = bbr_state_val(bbr); else rsm->r_bbr_state = 8; if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) { rsm->r_is_gain = 1; rsm->r_is_drain = 0; } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) { rsm->r_is_drain = 1; rsm->r_is_gain = 0; } else { rsm->r_is_drain = 0; rsm->r_is_gain = 0; } return; } /* * If we reach here its a retransmission and we need to find it. */ more: if (hintrsm && (hintrsm->r_start == seq_out)) { rsm = hintrsm; hintrsm = NULL; } else if (bbr->r_ctl.rc_next) { /* We have a hint from a previous run */ rsm = bbr->r_ctl.rc_next; } else { /* No hints sorry */ rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { /* * We used rc_next or hintrsm to retransmit, hopefully the * likely case. */ seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time); if (len == 0) { return; } else { goto more; } } /* Ok it was not the last pointer go through it the hard way. */ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { if (rsm->r_start == seq_out) { seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time); bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); if (len == 0) { return; } else { continue; } } if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { /* Transmitted within this piece */ /* * Ok we must split off the front and then let the * update do the rest */ nrsm = bbr_alloc_full_limit(bbr); if (nrsm == NULL) { bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); return; } /* * copy rsm to nrsm and then trim the front of rsm * to not include this part. */ bbr_clone_rsm(bbr, nrsm, rsm, seq_out); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~BBR_HAS_FIN); seq_out = bbr_update_entry(tp, bbr, nrsm, cts, &len, pacing_time); if (len == 0) { return; } } } /* * Hmm not found in map did they retransmit both old and on into the * new? 
*/ if (seq_out == tp->snd_max) { goto again; } else if (SEQ_LT(seq_out, tp->snd_max)) { #ifdef BBR_INVARIANTS printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { printf("rsm:%p start:%u end:%u\n", rsm, rsm->r_start, rsm->r_end); } printf("Dump complete\n"); panic("seq_out not found rack:%p tp:%p", bbr, tp); #endif } else { #ifdef BBR_INVARIANTS /* * Hmm beyond sndmax? (only if we are using the new rtt-pack * flag) */ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", seq_out, len, tp->snd_max, tp); #endif } } static void bbr_collapse_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, int32_t rtt) { /* * The cum-ack moved forward; collapse the retransmit backoff. */ tp->t_rxtshift = 0; tp->t_softerror = 0; } static void tcp_bbr_xmit_timer(struct tcp_bbr *bbr, uint32_t rtt_usecs, uint32_t rsm_send_time, uint32_t r_start, uint32_t tsin) { bbr->rtt_valid = 1; bbr->r_ctl.cur_rtt = rtt_usecs; bbr->r_ctl.ts_in = tsin; if (rsm_send_time) bbr->r_ctl.cur_rtt_send_time = rsm_send_time; } static void bbr_make_timestamp_determination(struct tcp_bbr *bbr) { /** * We have in our bbr control: * 1) The timestamp we started observing cum-acks (bbr->r_ctl.bbr_ts_check_tstmp). * 2) Our timestamp indicating when we sent that packet (bbr->r_ctl.bbr_ts_check_our_cts). * 3) The current timestamp that just came in (bbr->r_ctl.last_inbound_ts) * 4) The time that the packet that generated that ack was sent (bbr->r_ctl.cur_rtt_send_time) * * Now we can calculate the time between the sends by doing: * * delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts * * And the peer's time between receiving them by doing: * * peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp * * We want to figure out if the timestamp values are in msec, 10msec or usec. * We also may find that we can't use the timestamps if say we see * that the peer_delta indicates that though we may have taken 10ms to * pace out the data, it only saw 1ms between the two packets. This would * indicate that somewhere on the path is a batching entity that is giving * out time-slices of the actual b/w. This would mean we could not * reliably use the peer's timestamps. * * We expect delta > peer_delta initially, until we figure out the * timestamp difference which we will store in bbr->r_ctl.bbr_peer_tsratio. * If we place 1000 there then it's ms vs our usec. If we place 10000 there * then it's 10ms vs our usec. If the peer is running a usec clock we would * put a 1 there. If the value is faster than ours, we will disable the * use of timestamps (though we could revisit this later if we find it to be not * just an isolated one or two flows). * * To detect the batching middle boxes we will come up with our compensation and, * if with it in place we find the peer is drastically off (by some margin) in * the smaller direction, we will assume the worst case and disable use of timestamps. * */ uint64_t delta, peer_delta, delta_up; delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts; if (delta < bbr_min_usec_delta) { /* * We have not seen a minimum amount of time * between our send times, so we cannot yet * make a determination of the timestamp * granularity. */ return; } peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp; if (peer_delta < bbr_min_peer_delta) { /* * We may have enough in the form of * our delta but the peer's number * has not changed that much.
It could * be its clock ratio is such that * we need more data (10ms tick) or * there may be other compression scenarios * going on. In any event we need the * spread to be larger. */ return; } /* Ok lets first see which way our delta is going */ if (peer_delta > delta) { /* Very unlikely, the peer without * compensation shows that it saw * the two sends arrive further apart * than we saw them in microseconds. */ if (peer_delta < (delta + ((delta * (uint64_t)1000)/ (uint64_t)bbr_delta_percent))) { /* well it looks like the peer is a microsecond clock. */ bbr->rc_ts_clock_set = 1; bbr->r_ctl.bbr_peer_tsratio = 1; } else { bbr->rc_ts_cant_be_used = 1; bbr->rc_ts_clock_set = 1; } return; } /* Ok we know that the peer_delta is smaller than our send distance */ bbr->rc_ts_clock_set = 1; /* First question: is it within the percentage that says they are using usec time? */ delta_up = (peer_delta * 1000) / (uint64_t)bbr_delta_percent; if ((peer_delta + delta_up) >= delta) { /* It's a usec clock */ bbr->r_ctl.bbr_peer_tsratio = 1; bbr_log_tstmp_validation(bbr, peer_delta, delta); return; } /* Ok if not usec, what about 10usec (though unlikely)? */ delta_up = (peer_delta * 1000 * 10) / (uint64_t)bbr_delta_percent; if (((peer_delta * 10) + delta_up) >= delta) { bbr->r_ctl.bbr_peer_tsratio = 10; bbr_log_tstmp_validation(bbr, peer_delta, delta); return; } /* And what about 100usec (though again unlikely)? */ delta_up = (peer_delta * 1000 * 100) / (uint64_t)bbr_delta_percent; if (((peer_delta * 100) + delta_up) >= delta) { bbr->r_ctl.bbr_peer_tsratio = 100; bbr_log_tstmp_validation(bbr, peer_delta, delta); return; } /* And how about 1 msec (the most likely one)? */ delta_up = (peer_delta * 1000 * 1000) / (uint64_t)bbr_delta_percent; if (((peer_delta * 1000) + delta_up) >= delta) { bbr->r_ctl.bbr_peer_tsratio = 1000; bbr_log_tstmp_validation(bbr, peer_delta, delta); return; } /* Ok if not msec could it be 10 msec? */ delta_up = (peer_delta * 1000 * 10000) / (uint64_t)bbr_delta_percent; if (((peer_delta * 10000) + delta_up) >= delta) { bbr->r_ctl.bbr_peer_tsratio = 10000; return; } /* If we fall down here the clock ticks so slowly we can't use it */ bbr->rc_ts_cant_be_used = 1; bbr->r_ctl.bbr_peer_tsratio = 0; bbr_log_tstmp_validation(bbr, peer_delta, delta); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts) { int32_t delta; uint32_t rtt, tsin; int32_t rtt_ticks; if (bbr->rtt_valid == 0) /* No valid sample */ return; rtt = bbr->r_ctl.cur_rtt; tsin = bbr->r_ctl.ts_in; if (bbr->rc_prtt_set_ts) { /* * We are to force-feed the rttProp filter due * to an entry into PROBE_RTT. This assures * that the times are sync'd between when we * go into PROBE_RTT and the filter expiration. * * Google does not use a true filter, so they do * this implicitly since they only keep one value * and when they enter probe-rtt they update the * value to the newest rtt. */ uint32_t rtt_prop; bbr->rc_prtt_set_ts = 0; rtt_prop = get_filter_value_small(&bbr->r_ctl.rc_rttprop); if (rtt > rtt_prop) filter_increase_by_small(&bbr->r_ctl.rc_rttprop, (rtt - rtt_prop), cts); else apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); } if (bbr->rc_ack_was_delayed) rtt += bbr->r_ctl.rc_ack_hdwr_delay; if (rtt < bbr->r_ctl.rc_lowest_rtt) bbr->r_ctl.rc_lowest_rtt = rtt; bbr_log_rtt_sample(bbr, rtt, tsin); if (bbr->r_init_rtt) { /* * The initial rtt is not trusted; nuke it and let's get * our first valid measurement in.
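 * (A sketch of the ratio test used in bbr_make_timestamp_determination()
 * above follows.)
 */
#if 0
/*
 * Hypothetical helper, not compiled: accept a candidate clock ratio if
 * peer_delta scaled by that ratio, padded by the allowed percentage
 * slop, reaches the spacing we measured between our own sends. The
 * ratio candidates tried above are 1, 10, 100, 1000 and 10000.
 */
static int
ts_ratio_matches(uint64_t delta, uint64_t peer_delta, uint64_t ratio,
    uint64_t percent)
{
	uint64_t delta_up = (peer_delta * 1000 * ratio) / percent;

	return ((peer_delta * ratio + delta_up) >= delta);
}
#endif
/*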
*/ bbr->r_init_rtt = 0; tp->t_srtt = 0; } if ((bbr->rc_ts_clock_set == 0) && bbr->rc_ts_valid) { /* * So we have not yet figured out * what the peers TSTMP value is * in (most likely ms). We need a * series of cum-ack's to determine * this reliably. */ if (bbr->rc_ack_is_cumack) { if (bbr->rc_ts_data_set) { /* Lets attempt to determine the timestamp granularity. */ bbr_make_timestamp_determination(bbr); } else { bbr->rc_ts_data_set = 1; bbr->r_ctl.bbr_ts_check_tstmp = bbr->r_ctl.last_inbound_ts; bbr->r_ctl.bbr_ts_check_our_cts = bbr->r_ctl.cur_rtt_send_time; } } else { /* * We have to have consecutive acks * reset any "filled" state to none. */ bbr->rc_ts_data_set = 0; } } /* Round it up */ rtt_ticks = USEC_2_TICKS((rtt + (USECS_IN_MSEC - 1))); if (rtt_ticks == 0) rtt_ticks = 1; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic is * equivalent to the smoothing algorithm in rfc793 with an * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). * Adjust rtt to origin 0. */ delta = ((rtt_ticks - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); tp->t_srtt += delta; if (tp->t_srtt <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit timer * to smoothed rtt + 4 times the smoothed variance. rttvar * is stored as fixed point with 4 bits after the binary * point (scaled by 16). The following is equivalent to * rfc793 smoothing with an alpha of .75 (rttvar = * rttvar*3/4 + |delta| / 4). This replaces rfc793's * wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); tp->t_rttvar += delta; if (tp->t_rttvar <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. Set the * variance to half the rtt (so our first retransmit happens * at 3*rtt). */ tp->t_srtt = rtt_ticks << TCP_RTT_SHIFT; tp->t_rttvar = rtt_ticks << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt_ticks)); #endif /* * the retransmit should happen at rtt + 4 * rttvar. Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 * tick of bias. When we compute the retransmit timer, we want 1/2 * tick of rounding and 1 extra tick because of +-1/2 tick * uncertainty in the firing of the timer. The bias will give us * exactly the 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below the minimum * feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), rtt_ticks + 2), MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000)); /* * We received an ack for a packet that wasn't retransmitted; it is * probably safe to discard any error indications we've received * recently. This isn't quite right, but close enough for now (a * route might have failed after we sent a segment, and the return * path might not be symmetrical). 
*/ tp->t_softerror = 0; rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); if (bbr->r_ctl.bbr_smallest_srtt_this_state > rtt) bbr->r_ctl.bbr_smallest_srtt_this_state = rtt; } static void bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t t, uint32_t cts, int ack_type) { /* * For this RSM, we acknowledged the data from a previous * transmission, not the last one we made. This means we did a false * retransmit. */ if (rsm->r_flags & BBR_HAS_FIN) { /* * The FIN is often sent multiple times when we * have everything outstanding ack'd. We ignore this case * since it's over now. */ return; } if (rsm->r_flags & BBR_TLP) { /* * We expect TLP's to have this occur often */ bbr->rc_tlp_rtx_out = 0; return; } if (ack_type != BBR_CUM_ACKED) { /* * If it was not a cum-ack we * don't really know for sure since * the timestamp could be from some * other transmission. */ return; } if (rsm->r_flags & BBR_WAS_SACKPASS) { /* * We retransmitted based on a sack and the earlier * retransmission ack'd it - re-ordering is occurring. */ BBR_STAT_INC(bbr_reorder_seen); bbr->r_ctl.rc_reorder_ts = cts; } /* Back down the loss count */ if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; rsm->r_flags &= ~BBR_MARKED_LOST; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } /***** RRS HERE ************************/ /* Do we need to do this??? */ /* bbr_reset_lt_bw_sampling(bbr, cts); */ /***** RRS HERE ************************/ BBR_STAT_INC(bbr_badfr); BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start)); } static void bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line) { bbr->r_ctl.rc_rtt_shrinks = cts; if (bbr_can_force_probertt && (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) && ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) { /* * We should enter probe-rtt; it's been too long * since we have been there. */ bbr_enter_probe_rtt(bbr, cts, __LINE__); } else bbr_check_probe_rtt_limits(bbr, cts); } static void tcp_bbr_commit_bw(struct tcp_bbr *bbr, uint32_t cts) { uint64_t orig_bw; if (bbr->r_ctl.rc_bbr_cur_del_rate == 0) { /* We never apply a zero measurement */ bbr_log_type_bbrupd(bbr, 20, cts, 0, 0, 0, 0, 0, 0, 0, 0); return; } if (bbr->r_ctl.r_measurement_count < 0xffffffff) bbr->r_ctl.r_measurement_count++; orig_bw = get_filter_value(&bbr->r_ctl.rc_delrate); apply_filter_max(&bbr->r_ctl.rc_delrate, bbr->r_ctl.rc_bbr_cur_del_rate, bbr->r_ctl.rc_pkt_epoch); bbr_log_type_bbrupd(bbr, 21, cts, (uint32_t)orig_bw, (uint32_t)get_filter_value(&bbr->r_ctl.rc_delrate), 0, 0, 0, 0, 0, 0); if (orig_bw && (orig_bw != get_filter_value(&bbr->r_ctl.rc_delrate))) { if (bbr->bbr_hdrw_pacing) { /* * Apply a new rate to the hardware * possibly.
*/ bbr_update_hardware_pacing_rate(bbr, cts); } bbr_set_state_target(bbr, __LINE__); tcp_bbr_tso_size_check(bbr, cts); if (bbr->r_recovery_bw) { bbr_setup_red_bw(bbr, cts); bbr_log_type_bw_reduce(bbr, BBR_RED_BW_USELRBW); } } else if ((orig_bw == 0) && get_filter_value(&bbr->r_ctl.rc_delrate)) tcp_bbr_tso_size_check(bbr, cts); } static void bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts) { if (bbr->rc_in_persist == 0) { /* We log only when not in persist */ /* Translate to a Bytes Per Second */ uint64_t tim, bw, ts_diff, ts_bw; uint32_t upper, lower, delivered; if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time)) tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time); else tim = 1; /* * Now that we have processed the tim (skipping the sample * or possibly updating the time, go ahead and * calculate the cdr. */ delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered); bw = (uint64_t)delivered; bw *= (uint64_t)USECS_IN_SECOND; bw /= tim; if (bw == 0) { /* We must have a calculatable amount */ return; } upper = (bw >> 32) & 0x00000000ffffffff; lower = bw & 0x00000000ffffffff; /* * If we are using this b/w shove it in now so we * can see in the trace viewer if it gets over-ridden. */ if (rsm->r_ts_valid && bbr->rc_ts_valid && bbr->rc_ts_clock_set && (bbr->rc_ts_cant_be_used == 0) && bbr->rc_use_ts_limit) { ts_diff = max((bbr->r_ctl.last_inbound_ts - rsm->r_del_ack_ts), 1); ts_diff *= bbr->r_ctl.bbr_peer_tsratio; if ((delivered == 0) || (rtt < 1000)) { /* Can't use the ts */ bbr_log_type_bbrupd(bbr, 61, cts, ts_diff, bbr->r_ctl.last_inbound_ts, rsm->r_del_ack_ts, 0, 0, 0, 0, delivered); } else { ts_bw = (uint64_t)delivered; ts_bw *= (uint64_t)USECS_IN_SECOND; ts_bw /= ts_diff; bbr_log_type_bbrupd(bbr, 62, cts, (ts_bw >> 32), (ts_bw & 0xffffffff), 0, 0, 0, 0, ts_diff, delivered); if ((bbr->ts_can_raise) && (ts_bw > bw)) { bbr_log_type_bbrupd(bbr, 8, cts, delivered, ts_diff, (bw >> 32), (bw & 0x00000000ffffffff), 0, 0, 0, 0); bw = ts_bw; } else if (ts_bw && (ts_bw < bw)) { bbr_log_type_bbrupd(bbr, 7, cts, delivered, ts_diff, (bw >> 32), (bw & 0x00000000ffffffff), 0, 0, 0, 0); bw = ts_bw; } } } if (rsm->r_first_sent_time && TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) { uint64_t sbw, sti; /* * We use what was in flight at the time of our * send and the size of this send to figure * out what we have been sending at (amount). * For the time we take from the time of * the send of the first send outstanding * until this send plus this sends pacing * time. This gives us a good calculation * as to the rate we have been sending at. 
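 */
#if 0
/*
 * Hypothetical helper, not compiled: the send-rate bound computed just
 * below. Bytes in flight at send time divided by the spread between
 * the first outstanding send and this one, plus this send's pacing
 * delay, all in microseconds, yields bytes per second.
 */
static uint64_t
send_rate_bytes_per_sec(uint32_t flight_at_send, uint64_t send_spread,
    uint32_t pacing_delay)
{
	uint64_t sbw = (uint64_t)flight_at_send * 1000000;

	return (sbw / (send_spread + pacing_delay));
}
#endif
/*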
*/ sbw = (uint64_t)(rsm->r_flight_at_send); sbw *= (uint64_t)USECS_IN_SECOND; sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time; sti += rsm->r_pacing_delay; sbw /= sti; if (sbw < bw) { bbr_log_type_bbrupd(bbr, 6, cts, delivered, (uint32_t)sti, (bw >> 32), (uint32_t)bw, rsm->r_first_sent_time, 0, (sbw >> 32), (uint32_t)sbw); bw = sbw; } } /* Use the google algorithm for b/w measurements */ bbr->r_ctl.rc_bbr_cur_del_rate = bw; if ((rsm->r_app_limited == 0) || (bw > get_filter_value(&bbr->r_ctl.rc_delrate))) { tcp_bbr_commit_bw(bbr, cts); bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered, 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time); } } } static void bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts) { if (bbr->rc_in_persist == 0) { /* We log only when not in persist */ /* Translate to a Bytes Per Second */ uint64_t tim, bw; uint32_t upper, lower, delivered; int no_apply = 0; if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time)) tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time); else tim = 1; /* * Now that we have processed the tim (skipping the sample * or possibly updating the time, go ahead and * calculate the cdr. */ delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered); bw = (uint64_t)delivered; bw *= (uint64_t)USECS_IN_SECOND; bw /= tim; if (tim < bbr->r_ctl.rc_lowest_rtt) { bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered, tim, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0); no_apply = 1; } upper = (bw >> 32) & 0x00000000ffffffff; lower = bw & 0x00000000ffffffff; /* * If we are using this b/w shove it in now so we * can see in the trace viewer if it gets over-ridden. */ bbr->r_ctl.rc_bbr_cur_del_rate = bw; /* Gate by the sending rate */ if (rsm->r_first_sent_time && TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) { uint64_t sbw, sti; /* * We use what was in flight at the time of our * send and the size of this send to figure * out what we have been sending at (amount). * For the time we take from the time of * the send of the first send outstanding * until this send plus this sends pacing * time. This gives us a good calculation * as to the rate we have been sending at. */ sbw = (uint64_t)(rsm->r_flight_at_send); sbw *= (uint64_t)USECS_IN_SECOND; sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time; sti += rsm->r_pacing_delay; sbw /= sti; if (sbw < bw) { bbr_log_type_bbrupd(bbr, 6, cts, delivered, (uint32_t)sti, (bw >> 32), (uint32_t)bw, rsm->r_first_sent_time, 0, (sbw >> 32), (uint32_t)sbw); bw = sbw; } if ((sti > tim) && (sti < bbr->r_ctl.rc_lowest_rtt)) { bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered, (uint32_t)sti, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0); no_apply = 1; } else no_apply = 0; } bbr->r_ctl.rc_bbr_cur_del_rate = bw; if ((no_apply == 0) && ((rsm->r_app_limited == 0) || (bw > get_filter_value(&bbr->r_ctl.rc_delrate)))) { tcp_bbr_commit_bw(bbr, cts); bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered, 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time); } } } static void bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts, uint32_t tsin, uint32_t uts, int32_t match, uint32_t rsm_send_time, int32_t ack_type, struct tcpopt *to) { uint64_t old_rttprop; /* Update our delivery time and amount */ bbr->r_ctl.rc_delivered += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_del_time = cts; if (rtt == 0) { /* * 0 means its a retransmit, for now we don't use these for * the rest of BBR. 
*/ return; } if ((bbr->rc_use_google == 0) && (match != BBR_RTT_BY_EXACTMATCH) && (match != BBR_RTT_BY_TIMESTAMP)){ /* * We get a lot of rtt updates, let's not pay attention to * any that are not an exact match. That way we don't have * to worry about timestamps and the whole nonsense of * unsure if it's a retransmission etc (if we ever had the * timestamp fixed to always have the last thing sent this * would not be an issue). */ return; } if ((bbr_no_retran && bbr->rc_use_google) && (match != BBR_RTT_BY_EXACTMATCH) && (match != BBR_RTT_BY_TIMESTAMP)){ /* * We only do measurements in google mode * with bbr_no_retran on for sure things. */ return; } /* Only update srtt if we know by exact match */ tcp_bbr_xmit_timer(bbr, rtt, rsm_send_time, rsm->r_start, tsin); if (ack_type == BBR_CUM_ACKED) bbr->rc_ack_is_cumack = 1; else bbr->rc_ack_is_cumack = 0; old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP); /* * Note the following code differs from the original * BBR spec. It calls for <= not <. However after a * long discussion in email with Neal, he acknowledged * that it should be <, so that we will have flows * going into probe-rtt (we were seeing cases where that * did not happen and caused ugly things to occur). We * have added this agreed upon fix to our code base. */ if (rtt < old_rttprop) { /* Update when we last saw a rtt drop */ bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } bbr_log_type_bbrrttprop(bbr, rtt, (rsm ? rsm->r_end : 0), uts, cts, match, rsm->r_start, rsm->r_flags); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) { /* * The RTT-prop moved, reset the target (may be a * nop for some states). */ bbr_set_state_target(bbr, __LINE__); if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_NEW_TARGET, 0); else if (old_rttprop < bbr_get_rtt(bbr, BBR_RTT_PROP)) /* It went up */ bbr_check_probe_rtt_limits(bbr, cts); } if ((bbr->rc_use_google == 0) && (match == BBR_RTT_BY_TIMESTAMP)) { /* * We don't do b/w update with * these since they are not really * reliable. */ return; } if (bbr->r_ctl.r_app_limited_until && (bbr->r_ctl.rc_delivered >= bbr->r_ctl.r_app_limited_until)) { /* We are no longer app-limited */ bbr->r_ctl.r_app_limited_until = 0; } if (bbr->rc_use_google) { bbr_google_measurement(bbr, rsm, rtt, cts); } else { bbr_nf_measurement(bbr, rsm, rtt, cts); } } /* * Convert a timestamp that the main stack * uses (milliseconds) into one that bbr uses * (microseconds). Return that converted timestamp. */ static uint32_t bbr_ts_convert(uint32_t cts) { uint32_t sec, msec; sec = cts / MS_IN_USEC; msec = cts - (MS_IN_USEC * sec); return ((sec * USECS_IN_SECOND) + (msec * MS_IN_USEC)); } /* * Return 0 if we did not update the RTT time, return * 1 if we did. */ static int bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, uint32_t th_ack) { int32_t i; uint32_t t, uts = 0; if ((rsm->r_flags & BBR_ACKED) || (rsm->r_flags & BBR_WAS_RENEGED) || (rsm->r_flags & BBR_RXT_CLEARED)) { /* Already done */ return (0); } if (rsm->r_rtr_cnt == 1) { /* * Only one transmit. Hopefully the normal case.
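 * With a single transmission the sample below is unambiguous: e.g. a * segment last sent at cts 1000 and acked at cts 31000 yields t = 30000 * usecs, handed to bbr_update_bbr_info() as BBR_RTT_BY_EXACTMATCH.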
*/ if (TSTMP_GT(cts, rsm->r_tim_lastsent[0])) t = cts - rsm->r_tim_lastsent[0]; else t = 1; if ((int)t <= 0) t = 1; bbr->r_ctl.rc_last_rtt = t; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to); return (1); } /* Convert to usecs */ if ((bbr_can_use_ts_for_rtt == 1) && (bbr->rc_use_google == 1) && (ack_type == BBR_CUM_ACKED) && (to->to_flags & TOF_TS) && (to->to_tsecr != 0)) { t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr; if (t < 1) t = 1; t *= MS_IN_USEC; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, BBR_RTT_BY_TIMESTAMP, rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)], ack_type, to); return (1); } uts = bbr_ts_convert(to->to_tsecr); if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0) && (ack_type == BBR_CUM_ACKED) && ((rsm->r_flags & BBR_OVERMAX) == 0)) { /* * Now which timestamp does it match? In this block the ACK * may be coming from a previous transmission. */ uint32_t fudge; fudge = BBR_TIMER_FUDGE; for (i = 0; i < rsm->r_rtr_cnt; i++) { if ((SEQ_GEQ(uts, (rsm->r_tim_lastsent[i] - fudge))) && (SEQ_LEQ(uts, (rsm->r_tim_lastsent[i] + fudge)))) { if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) t = cts - rsm->r_tim_lastsent[i]; else t = 1; if ((int)t <= 0) t = 1; bbr->r_ctl.rc_last_rtt = t; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING, rsm->r_tim_lastsent[i], ack_type, to); if ((i + 1) < rsm->r_rtr_cnt) { /* Likely */ bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type); } else if (rsm->r_flags & BBR_TLP) { bbr->rc_tlp_rtx_out = 0; } return (1); } } /* Fall through if we can't find a matching timestamp */ } /* * Ok it's a SACK block that we retransmitted, or a windows * machine without timestamps. We can tell nothing from the * time-stamp since it's not there, or it reflects the time the peer last * received a segment that moved forward its cum-ack point. * * Let's look at the last retransmit and see what we can tell * (with BBR, for space, we only keep 2; note we have to keep * at least 2 so the map cannot be condensed more). */ i = rsm->r_rtr_cnt - 1; if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) t = cts - rsm->r_tim_lastsent[i]; else goto not_sure; if (t < bbr->r_ctl.rc_lowest_rtt) { /* * We retransmitted and the ack came back in less * than the smallest rtt we have observed in the * windowed rtt. We most likely did an improper * retransmit as outlined in 4.2 Step 3 point 2 in * the rack-draft. * * Use the prior transmission to update all the * information as long as there is only one prior * transmission. */ if ((rsm->r_flags & BBR_OVERMAX) == 0) { #ifdef BBR_INVARIANTS if (rsm->r_rtr_cnt == 1) panic("rsm:%p bbr:%p rsm has overmax and only 1 retransmit flags:%x?", rsm, bbr, rsm->r_flags); #endif i = rsm->r_rtr_cnt - 2; if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) t = cts - rsm->r_tim_lastsent[i]; else t = 1; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET, rsm->r_tim_lastsent[i], ack_type, to); bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type); } else { /* * Too many prior transmissions, just * update BBR delivered */ not_sure: bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts, BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to); } } else { /* * We retransmitted it and the retransmit did the * job.
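 * (Here t, computed above from the last retransmission, is at least * rc_lowest_rtt, so attributing the ack to that retransmit is reasonable.)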
*/ if (rsm->r_flags & BBR_TLP) bbr->rc_tlp_rtx_out = 0; if ((rsm->r_flags & BBR_OVERMAX) == 0) bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_THIS_RETRAN, 0, ack_type, to); else bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts, BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to); return (1); } return (0); } /* * Mark the SACK_PASSED flag on all entries prior to rsm send wise. */ static void bbr_log_sack_passed(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm) { struct bbr_sendmap *nrsm; nrsm = rsm; TAILQ_FOREACH_REVERSE_FROM(nrsm, &bbr->r_ctl.rc_tmap, bbr_head, r_tnext) { if (nrsm == rsm) { /* Skip the original segment; it is acked */ continue; } if (nrsm->r_flags & BBR_ACKED) { /* Skip ack'd segments */ continue; } if (nrsm->r_flags & BBR_SACK_PASSED) { /* * We found one that is already marked * passed, we have been here before and * so all others below this are marked. */ break; } BBR_STAT_INC(bbr_sack_passed); nrsm->r_flags |= BBR_SACK_PASSED; if (((nrsm->r_flags & BBR_MARKED_LOST) == 0) && bbr_is_lost(bbr, nrsm, bbr->r_ctl.rc_rcvtime)) { bbr->r_ctl.rc_lost += nrsm->r_end - nrsm->r_start; bbr->r_ctl.rc_lost_bytes += nrsm->r_end - nrsm->r_start; nrsm->r_flags |= BBR_MARKED_LOST; } nrsm->r_flags &= ~BBR_WAS_SACKPASS; } } /* * Returns the number of bytes that were * newly ack'd by sack blocks. */ static uint32_t bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack, struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts) { int32_t times = 0; uint32_t start, end, maxseg, changed = 0; struct bbr_sendmap *rsm, *nrsm; int32_t used_ref = 1; uint8_t went_back = 0, went_fwd = 0; maxseg = tp->t_maxseg - bbr->rc_last_options; start = sack->start; end = sack->end; rsm = *prsm; if (rsm == NULL) used_ref = 0; /* Do we locate the block behind where we last were? */ if (rsm && SEQ_LT(start, rsm->r_start)) { went_back = 1; TAILQ_FOREACH_REVERSE_FROM(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { goto do_rest_ofb; } } } start_at_beginning: went_fwd = 1; /* * Ok let's locate the block where this guy is, fwd from rsm (if it's * set) */ TAILQ_FOREACH_FROM(rsm, &bbr->r_ctl.rc_map, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { break; } } do_rest_ofb: if (rsm == NULL) { /* * This happens when we get duplicate sack blocks with the * same end. For example SACK 4: 100 SACK 3: 100. The sort * will not change their location, so we would just start at * the end of the first one and get lost. */ if (tp->t_flags & TF_SENTFIN) { /* * Check to see if we have not logged the FIN that * went out. */ nrsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { /* * Ok we did not get the FIN logged. */ nrsm->r_end++; rsm = nrsm; goto do_rest_ofb; } } if (times == 1) { #ifdef BBR_INVARIANTS panic("tp:%p bbr:%p sack:%p to:%p prsm:%p", tp, bbr, sack, to, prsm); #else goto out; #endif } times++; BBR_STAT_INC(bbr_sack_proc_restart); rsm = NULL; goto start_at_beginning; } /* Ok we have an ACK for some piece of rsm */ if (rsm->r_start != start) { /* * Need to split this in two pieces, the before and after.
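 * For example, if this rsm covers 1000-4000 and the sack starts at 2000, * the clone below leaves rsm as 1000-2000 and nrsm as 2000-4000, and we * continue processing on the nrsm piece that the block actually covers.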
*/ if (bbr_sack_mergable(rsm, start, end)) nrsm = bbr_alloc_full_limit(bbr); else nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* We could not allocate; ignore the sack */ struct sackblk blk; blk.start = start; blk.end = end; sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk); goto out; } bbr_clone_rsm(bbr, nrsm, rsm, start); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~BBR_HAS_FIN); rsm = nrsm; } if (SEQ_GEQ(end, rsm->r_end)) { /* * The end of this block is either beyond this guy or right * at this guy. */ if ((rsm->r_flags & BBR_ACKED) == 0) { bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0); changed += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); bbr_log_sack_passed(tp, bbr, rsm); if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; } /* Is reordering occurring? */ if (rsm->r_flags & BBR_SACK_PASSED) { BBR_STAT_INC(bbr_reorder_seen); bbr->r_ctl.rc_reorder_ts = cts; if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } } rsm->r_flags |= BBR_ACKED; rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST); if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } } bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED); if (end == rsm->r_end) { /* This block only - done */ goto out; } /* There is more not covered by this rsm, move on */ start = rsm->r_end; nrsm = TAILQ_NEXT(rsm, r_next); rsm = nrsm; times = 0; goto do_rest_ofb; } if (rsm->r_flags & BBR_ACKED) { /* Been here done that */ goto out; } /* Ok we need to split off this one at the tail */ if (bbr_sack_mergable(rsm, start, end)) nrsm = bbr_alloc_full_limit(bbr); else nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* failed XXXrrs what can we do but lose the sack info? */ struct sackblk blk; blk.start = start; blk.end = end; sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk); goto out; } /* Clone it */ bbr_clone_rsm(bbr, nrsm, rsm, end); /* The sack block does not cover this guy fully */ rsm->r_flags &= (~BBR_HAS_FIN); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } nrsm->r_dupack = 0; bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0); bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED); changed += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); bbr_log_sack_passed(tp, bbr, rsm); /* Is reordering occurring?
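 * (BBR_SACK_PASSED means a sack for later data already passed this segment * by; an ack for it now implies out-of-order delivery, so we note the time * and back out any premature loss accounting.)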
*/ if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; } if (rsm->r_flags & BBR_SACK_PASSED) { BBR_STAT_INC(bbr_reorder_seen); bbr->r_ctl.rc_reorder_ts = cts; if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } } rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST); rsm->r_flags |= BBR_ACKED; if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } out: if (rsm && (rsm->r_flags & BBR_ACKED)) { /* * Now can we merge this newly acked * block with either the previous or * next block? */ nrsm = TAILQ_NEXT(rsm, r_next); if (nrsm && (nrsm->r_flags & BBR_ACKED)) { /* yep this and next can be merged */ rsm = bbr_merge_rsm(bbr, rsm, nrsm); } /* Now what about the previous? */ nrsm = TAILQ_PREV(rsm, bbr_head, r_next); if (nrsm && (nrsm->r_flags & BBR_ACKED)) { /* yep the previous and this can be merged */ rsm = bbr_merge_rsm(bbr, nrsm, rsm); } } if (used_ref == 0) { BBR_STAT_INC(bbr_sack_proc_all); } else { BBR_STAT_INC(bbr_sack_proc_short); } if (went_fwd && went_back) { BBR_STAT_INC(bbr_sack_search_both); } else if (went_fwd) { BBR_STAT_INC(bbr_sack_search_fwd); } else if (went_back) { BBR_STAT_INC(bbr_sack_search_back); } /* Save off where the next seq is */ if (rsm) bbr->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); else bbr->r_ctl.rc_sacklast = NULL; *prsm = rsm; return (changed); } static void inline bbr_peer_reneges(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, tcp_seq th_ack) { struct bbr_sendmap *tmap; BBR_STAT_INC(bbr_reneges_seen); tmap = NULL; while (rsm && (rsm->r_flags & BBR_ACKED)) { /* It's no longer sacked, mark it so */ uint32_t oflags; bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); #ifdef BBR_INVARIANTS if (rsm->r_in_tmap) { panic("bbr:%p rsm:%p flags:0x%x in tmap?", bbr, rsm, rsm->r_flags); } #endif oflags = rsm->r_flags; if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS | BBR_MARKED_LOST); rsm->r_flags |= BBR_WAS_RENEGED; rsm->r_flags |= BBR_RXT_CLEARED; bbr_log_type_rsmclear(bbr, bbr->r_ctl.rc_rcvtime, rsm, oflags, __LINE__); /* Rebuild it into our tmap */ if (tmap == NULL) { TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext); tmap = rsm; } else { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, tmap, rsm, r_tnext); tmap = rsm; } tmap->r_in_tmap = 1; /* * XXXrrs Delivered? Should we do anything here? * * Of course we don't on a rxt timeout so maybe it's ok that * we don't? * * For now let's not. */ rsm = TAILQ_NEXT(rsm, r_next); } /* * Now let's possibly clear the sack filter so we start recognizing * sacks that cover this area.
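 * (The filter has been treating blocks at or below th_ack as already * seen; after a reneg those sequences are outstanding again and must be * processed anew.)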
*/ sack_filter_clear(&bbr->r_ctl.bbr_sf, th_ack); } static void bbr_log_syn(struct tcpcb *tp, struct tcpopt *to) { struct tcp_bbr *bbr; struct bbr_sendmap *rsm; uint32_t cts; bbr = (struct tcp_bbr *)tp->t_fb_ptr; cts = bbr->r_ctl.rc_rcvtime; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); if (rsm && (rsm->r_flags & BBR_HAS_SYN)) { if ((rsm->r_end - rsm->r_start) <= 1) { /* Log out the SYN completely */ bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } if (bbr->r_ctl.rc_next == rsm) { /* scoot along the marker */ bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map); } if (to != NULL) bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, 0); bbr_free(bbr, rsm); } else { /* There is more (Fast open)? Strip out the SYN. */ rsm->r_flags &= ~BBR_HAS_SYN; rsm->r_start++; } } } /* * Returns the number of bytes that were * acknowledged by SACK blocks. */ static uint32_t bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, uint32_t *prev_acked) { uint32_t changed, last_seq, entered_recovery = 0; struct tcp_bbr *bbr; struct bbr_sendmap *rsm; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; register uint32_t th_ack; int32_t i, j, k, new_sb, num_sack_blks = 0; uint32_t cts, acked, ack_point, sack_changed = 0; uint32_t p_maxseg, maxseg, p_acked = 0; INP_WLOCK_ASSERT(tp->t_inpcb); if (th->th_flags & TH_RST) { /* We don't log resets */ return (0); } bbr = (struct tcp_bbr *)tp->t_fb_ptr; cts = bbr->r_ctl.rc_rcvtime; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); changed = 0; maxseg = tp->t_maxseg - bbr->rc_last_options; p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg); th_ack = th->th_ack; if (SEQ_GT(th_ack, tp->snd_una)) { acked = th_ack - tp->snd_una; bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__); bbr->rc_tp->t_acktime = ticks; } else acked = 0; if (SEQ_LEQ(th_ack, tp->snd_una)) { /* Only sent here for sack processing */ goto proc_sack; } if (rsm && SEQ_GT(th_ack, rsm->r_start)) { changed = th_ack - rsm->r_start; } else if ((rsm == NULL) && ((th_ack - 1) == tp->iss)) { /* * For the SYN incoming case we will not have called * tcp_output for the sending of the SYN, so there will be * no map. All other cases should probably be a panic. */ if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0)) { /* * We have a timestamp that can be used to generate * an initial RTT. */ uint32_t ts, now, rtt; ts = bbr_ts_convert(to->to_tsecr); now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv)); rtt = now - ts; if (rtt < 1) rtt = 1; bbr_log_type_bbrrttprop(bbr, rtt, tp->iss, 0, cts, BBR_RTT_BY_TIMESTAMP, tp->iss, 0); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); changed = 1; bbr->r_wanted_output = 1; goto out; } goto proc_sack; } else if (rsm == NULL) { goto out; } if (changed) { /* * The ACK point is advancing to th_ack, we must drop off * the packets in the rack log and calculate any eligible * RTTs. */ bbr->r_wanted_output = 1; more: if (rsm == NULL) { if (tp->t_flags & TF_SENTFIN) { /* if we send a FIN we will not have a map */ goto proc_sack; } #ifdef BBR_INVARIANTS panic("No rack map tp:%p for th:%p state:%d bbr:%p snd_una:%u snd_max:%u chg:%d\n", tp, th, tp->t_state, bbr, tp->snd_una, tp->snd_max, changed); #endif goto proc_sack; } } if (SEQ_LT(th_ack, rsm->r_start)) { /* Huh, map is missing this */ #ifdef BBR_INVARIANTS printf("Rack map starts at r_start:%u for th_ack:%u huh?
ts:%d rs:%d bbr:%p\n", rsm->r_start, th_ack, tp->t_state, bbr->r_state, bbr); panic("th-ack is bad bbr:%p tp:%p", bbr, tp); #endif goto proc_sack; } else if (th_ack == rsm->r_start) { /* None here to ack */ goto proc_sack; } /* * Clear the dup ack counter, it will * either be freed or if there is some * remaining we need to start it at zero. */ rsm->r_dupack = 0; /* Now do we consume the whole thing? */ if (SEQ_GEQ(th_ack, rsm->r_end)) { /* Its all consumed. */ uint32_t left; if (rsm->r_flags & BBR_ACKED) { /* * It was acked on the scoreboard -- remove it from * total */ p_acked += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); if (bbr->r_ctl.rc_sacked == 0) bbr->r_ctl.rc_sacklast = NULL; } else { bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, th_ack); if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; } if (rsm->r_flags & BBR_SACK_PASSED) { /* * There are acked segments ACKED on the * scoreboard further up. We are seeing * reordering. */ BBR_STAT_INC(bbr_reorder_seen); bbr->r_ctl.rc_reorder_ts = cts; if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } } rsm->r_flags &= ~BBR_MARKED_LOST; } bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } if (bbr->r_ctl.rc_next == rsm) { /* scoot along the marker */ bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map); } bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED); /* Adjust the packet counts */ left = th_ack - rsm->r_end; /* Free back to zone */ bbr_free(bbr, rsm); if (left) { rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); goto more; } goto proc_sack; } if (rsm->r_flags & BBR_ACKED) { /* * It was acked on the scoreboard -- remove it from total * for the part being cum-acked. */ p_acked += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_sacked -= (th_ack - rsm->r_start); if (bbr->r_ctl.rc_sacked == 0) bbr->r_ctl.rc_sacklast = NULL; } else { /* * It was acked up to th_ack point for the first time */ struct bbr_sendmap lrsm; memcpy(&lrsm, rsm, sizeof(struct bbr_sendmap)); lrsm.r_end = th_ack; bbr_update_rtt(tp, bbr, &lrsm, to, cts, BBR_CUM_ACKED, th_ack); } if ((rsm->r_flags & BBR_MARKED_LOST) && ((rsm->r_flags & BBR_ACKED) == 0)) { /* * It was marked lost and partly ack'd now * for the first time. We lower the rc_lost_bytes * and still leave it MARKED. */ bbr->r_ctl.rc_lost_bytes -= th_ack - rsm->r_start; } bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED); bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; /* adjust packet count */ rsm->r_start = th_ack; proc_sack: /* Check for reneging */ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); if (rsm && (rsm->r_flags & BBR_ACKED) && (th_ack == rsm->r_start)) { /* * The peer has moved snd_una up to the edge of this send, * i.e. one that it had previously acked. The only way that * can be true if the peer threw away data (space issues) * that it had previously sacked (else it would have given * us snd_una up to (rsm->r_end). We need to undo the acked * markings here. * * Note we have to look to make sure th_ack is our * rsm->r_start in case we get an old ack where th_ack is * behind snd_una. 
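 * (Only when th_ack lands exactly on the start of an rsm still marked * BBR_ACKED do we know sacked data was discarded; bbr_peer_reneges() then * strips the ACKED markings and puts the data back in the transmit map.)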
*/ bbr_peer_reneges(bbr, rsm, th->th_ack); } if ((to->to_flags & TOF_SACK) == 0) { /* We are done, nothing left to log */ goto out; } rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); if (rsm) { last_seq = rsm->r_end; } else { last_seq = tp->snd_max; } /* Sack block processing */ if (SEQ_GT(th_ack, tp->snd_una)) ack_point = th_ack; else ack_point = tp->snd_una; for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, ack_point) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, ack_point) && SEQ_LEQ(sack.end, tp->snd_max)) { if ((bbr->r_ctl.rc_num_small_maps_alloced > bbr_sack_block_limit) && (SEQ_LT(sack.end, last_seq)) && ((sack.end - sack.start) < (p_maxseg / 8))) { /* * Not the last piece and it's smaller than * 1/8th of a p_maxseg. We ignore this. */ BBR_STAT_INC(bbr_runt_sacks); continue; } sack_blocks[num_sack_blks] = sack; num_sack_blks++; #ifdef NETFLIX_STATS } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { /* * It's a D-SACK block. */ tcp_record_dsack(sack.start, sack.end); #endif } } if (num_sack_blks == 0) goto out; /* * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. */ new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks, num_sack_blks, th->th_ack); ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks); BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks); BBR_STAT_ADD(bbr_sack_blocks_skip, (num_sack_blks - new_sb)); num_sack_blks = new_sb; if (num_sack_blks < 2) { goto do_sack_work; } /* Sort the sacks */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } /* * Now are any of the sack block ends the same (yes, some * implementations send these)? */ again: if (num_sack_blks > 1) { for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (sack_blocks[i].end == sack_blocks[j].end) { /* * Ok these two have the same end, we * want the smallest end and then * throw away the larger and start * again. */ if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { /* * The second block covers * more area, use that */ sack_blocks[i].start = sack_blocks[j].start; } /* * Now collapse out the dup-sack and * lower the count */ for (k = (j + 1); k < num_sack_blks; k++) { sack_blocks[j].start = sack_blocks[k].start; sack_blocks[j].end = sack_blocks[k].end; j++; } num_sack_blks--; goto again; } } } } do_sack_work: rsm = bbr->r_ctl.rc_sacklast; for (i = 0; i < num_sack_blks; i++) { acked = bbr_proc_sack_blk(tp, bbr, &sack_blocks[i], to, &rsm, cts); if (acked) { bbr->r_wanted_output = 1; changed += acked; sack_changed += acked; } } out: *prev_acked = p_acked; if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { /* * Ok we have a high probability that we need to go into * recovery since we have data sack'd */ struct bbr_sendmap *rsm; rsm = bbr_check_recovery_mode(tp, bbr, cts); if (rsm) { /* Enter recovery */ entered_recovery = 1; bbr->r_wanted_output = 1; /* * When we enter recovery we need to ensure we send * one packet.
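 * (rc_resend is the hint the output path uses for what to retransmit * first, so we only fill it in if nothing is queued there already.)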
*/ if (bbr->r_ctl.rc_resend == NULL) { bbr->r_ctl.rc_resend = rsm; } } } if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { /* * See if we need to rack-retransmit anything if so set it * up as the thing to resend assuming something else is not * already in that position. */ if (bbr->r_ctl.rc_resend == NULL) { bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); } } /* * We return the amount that changed via sack, this is used by the * ack-received code to augment what was changed between th_ack <-> * snd_una. */ return (sack_changed); } static void bbr_strike_dupack(struct tcp_bbr *bbr) { struct bbr_sendmap *rsm; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if (rsm && (rsm->r_dupack < 0xff)) { rsm->r_dupack++; if (rsm->r_dupack >= DUP_ACK_THRESHOLD) bbr->r_wanted_output = 1; } } /* * Return value of 1, we do not need to call bbr_process_data(). * return value of 0, bbr_process_data can be called. * For ret_val if its 0 the TCB is locked and valid, if its non-zero * its unlocked and probably unsafe to touch the TCB. */ static int bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val) { int32_t ourfinisacked = 0; int32_t acked_amount; uint16_t nsegs; int32_t acked; uint32_t lost, sack_changed = 0; struct mbuf *mfree; struct tcp_bbr *bbr; uint32_t prev_acked = 0; bbr = (struct tcp_bbr *)tp->t_fb_ptr; lost = bbr->r_ctl.rc_lost; nsegs = max(1, m->m_pkthdr.lro_nsegs); if (SEQ_GT(th->th_ack, tp->snd_max)) { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); bbr->r_wanted_output = 1; return (1); } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { /* Process the ack */ if (bbr->rc_in_persist) tp->t_rxtshift = 0; if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) bbr_strike_dupack(bbr); sack_changed = bbr_log_ack(tp, to, th, &prev_acked); } bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, (bbr->r_ctl.rc_lost > lost)); if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* * Old ack, behind the last one rcv'd or a duplicate ack * with SACK info. */ if (th->th_ack == tp->snd_una) { bbr_ack_received(tp, bbr, th, 0, sack_changed, prev_acked, __LINE__, 0); if (bbr->r_state == TCPS_SYN_SENT) { /* * Special case on where we sent SYN. When * the SYN-ACK is processed in syn_sent * state it bumps the snd_una. This causes * us to hit here even though we did ack 1 * byte. * * Go through the nothing left case so we * send data. */ goto nothing_left; } } return (0); } /* * If we reach this point, ACK is not a duplicate, i.e., it ACKs * something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our SYN has * been ACK'd (so connection is now fully synchronized). Go * to non-starred state, increment snd_una for ACK of SYN, * and check if we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK arrives * within our recovery window, then it was a mistake to do the * retransmit in the first place. Recover our original cwnd and * ssthresh, and proceed to transmit where we left off. 
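 * (t_badrxtwin marks the end of that window; an ack arriving before it * most likely acknowledges the original transmission rather than the * retransmit, so CC_RTO_ERR undoes the congestion response.)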
*/ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); } SOCKBUF_LOCK(&so->so_snd); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; mfree = sbcut_locked(&so->so_snd, acked_amount); /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); if (SEQ_GT(th->th_ack, tp->snd_una)) { bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp)); } tp->snd_una = th->th_ack; bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, (bbr->r_ctl.rc_lost - lost)); if (IN_RECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover) && (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_bbr_partialack(tp); } else { bbr_post_recovery(tp); } } if (SEQ_GT(tp->snd_una, tp->snd_recover)) { tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tp->snd_nxt = tp->snd_max; } if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ nothing_left: bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__); if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) bbr->rc_tp->t_acktime = 0; if ((sbused(&so->so_snd) == 0) && (tp->t_flags & TF_SENTFIN)) { ourfinisacked = 1; } bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); if (bbr->rc_in_persist == 0) { bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime; } sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime); /* * We invalidate the last ack here since we * don't want to transfer forward the time * for our sum's calculations. */ if ((tp->t_state >= TCPS_FIN_WAIT_1) && (sbavail(&so->so_snd) == 0) && (tp->t_flags2 & TF2_DROP_AF_DATA)) { /* * The socket was gone and the peer sent data, time * to reset him. */ *ret_val = 1; tp = tcp_close(tp); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); BBR_STAT_INC(bbr_dropped_af_data); return (1); } /* Set need output so persist might get set */ bbr->r_wanted_output = 1; } if (ofia) *ofia = ourfinisacked; return (0); } static void bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line) { if (bbr->rc_in_persist == 0) { bbr_timer_cancel(bbr, __LINE__, cts); bbr->r_ctl.rc_last_delay_val = 0; tp->t_rxtshift = 0; bbr->rc_in_persist = 1; bbr->r_ctl.rc_went_idle_time = cts; /* We should be capped when rw went to 0 but just in case */ bbr_log_type_pesist(bbr, cts, 0, line, 1); /* Time freezes for the state, so do the accounting now */ if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { uint32_t time_in; time_in = cts - bbr->r_ctl.rc_bbr_state_time; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { int32_t idx; idx = bbr_state_val(bbr); counter_u64_add(bbr_state_time[(idx + 5)], time_in); } else { counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } } bbr->r_ctl.rc_bbr_state_time = cts; } } static void bbr_restart_after_idle(struct tcp_bbr *bbr, uint32_t cts, uint32_t idle_time) { /* * Note that if idle time does not exceed our * threshold, we do nothing continuing the state * transitions we were last walking through. */ if (idle_time >= bbr_idle_restart_threshold) { if (bbr->rc_use_idle_restart) { bbr->rc_bbr_state = BBR_STATE_IDLE_EXIT; /* * Set our target using BBR_UNIT, so * we increase at a dramatic rate but * we stop when we get the pipe * full again for our current b/w estimate. 
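 * (BBR_UNIT is a gain of 1.0 in the fixed-point form the gains use; the * target is computed at that neutral gain, then the pacing and cwnd gains * are raised to the startup gain below to ramp back toward a full pipe.)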
*/ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; bbr_set_state_target(bbr, __LINE__); /* Now setup our gains to ramp up */ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; bbr_log_type_statechange(bbr, cts, __LINE__); } else { bbr_substate_change(bbr, cts, __LINE__, 1); } } } static void bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line) { uint32_t idle_time; if (bbr->rc_in_persist == 0) return; idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time); bbr->rc_in_persist = 0; bbr->rc_hit_state_1 = 0; tp->t_flags &= ~TF_FORCEDATA; bbr->r_ctl.rc_del_time = cts; /* * We invalidate the last ack here since we * don't want to transfer forward the time * for our sum's calculations. */ if (bbr->rc_inp->inp_in_hpts) { tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); bbr->rc_timer_first = 0; bbr->r_ctl.rc_hpts_flags = 0; bbr->r_ctl.rc_last_delay_val = 0; bbr->r_ctl.rc_hptsi_agg_delay = 0; bbr->r_agg_early_set = 0; bbr->r_ctl.rc_agg_early = 0; } bbr_log_type_pesist(bbr, cts, idle_time, line, 0); if (idle_time >= bbr_rtt_probe_time) { /* * This qualifies as a RTT_PROBE session since we drop the * data outstanding to nothing and waited more than * bbr_rtt_probe_time. */ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_PERSIST, 0); bbr->r_ctl.last_in_probertt = bbr->r_ctl.rc_rtt_shrinks = cts; } tp->t_rxtshift = 0; /* * If in probeBW and we have persisted more than an RTT let's do * special handling. */ /* Force a time based epoch */ bbr_set_epoch(bbr, cts, __LINE__); /* * Setup the lost so we don't count anything against the guy * we have been stuck with during persists. */ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; /* Time un-freezes for the state */ bbr->r_ctl.rc_bbr_state_time = cts; if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) || (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)) { /* * If we are going back to probe-bw * or probe_rtt, we may need to possibly * do a fast restart. */ bbr_restart_after_idle(bbr, cts, idle_time); } } static void bbr_collapsed_window(struct tcp_bbr *bbr) { /* * Now we must walk the * send map and divide the * ones left stranded. These * guys can't cause us to abort * the connection and are really * "unsent". However, if a buggy * client actually did keep some * of the data, i.e. collapsed the win * and refused to ack, and then opened * the win and acked that data, we would * get into an ack war; so the simpler * method of just pretending we did not * send those segments won't work. */ struct bbr_sendmap *rsm, *nrsm; tcp_seq max_seq; uint32_t maxseg; int can_split = 0; int fnd = 0; maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; max_seq = bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd; bbr_log_type_rwnd_collapse(bbr, max_seq, 1, 0); TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { /* Find the first seq past or at maxseq */ if (rsm->r_flags & BBR_RWND_COLLAPSED) rsm->r_flags &= ~BBR_RWND_COLLAPSED; if (SEQ_GEQ(max_seq, rsm->r_start) && SEQ_GEQ(rsm->r_end, max_seq)) { fnd = 1; break; } } bbr->rc_has_collapsed = 0; if (!fnd) { /* Nothing to do, strange */ return; } /* * Now can we split? * * We don't want to split if splitting * would generate too many small segments * lest we let an attacker fragment our * send_map and leave us out of memory. */ if ((max_seq != rsm->r_start) && (max_seq != rsm->r_end)){ /* can we split?
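 * Both remaining pieces must be at least maxseg/8 bytes, or we must still * be under the small-map allocation limit; otherwise a peer repeatedly * moving its window edge could shatter the send map and run us out of * memory.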
*/ int res1, res2; res1 = max_seq - rsm->r_start; res2 = rsm->r_end - max_seq; if ((res1 >= (maxseg/8)) && (res2 >= (maxseg/8))) { /* No small pieces here */ can_split = 1; } else if (bbr->r_ctl.rc_num_small_maps_alloced < bbr_sack_block_limit) { /* We are under the limit */ can_split = 1; } } /* Ok do we need to split this rsm? */ if (max_seq == rsm->r_start) { /* It's this guy no split required */ nrsm = rsm; } else if (max_seq == rsm->r_end) { /* It's the next one no split required. */ nrsm = TAILQ_NEXT(rsm, r_next); if (nrsm == NULL) { /* Huh? */ return; } } else if (can_split && SEQ_LT(max_seq, rsm->r_end)) { /* yep we need to split it */ nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* failed XXXrrs what can we do mark the whole? */ nrsm = rsm; goto no_split; } /* Clone it */ bbr_log_type_rwnd_collapse(bbr, max_seq, 3, 0); bbr_clone_rsm(bbr, nrsm, rsm, max_seq); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } } else { /* * Split not allowed just start here just * use this guy. */ nrsm = rsm; } no_split: BBR_STAT_INC(bbr_collapsed_win); /* reuse fnd as a count */ fnd = 0; TAILQ_FOREACH_FROM(nrsm, &bbr->r_ctl.rc_map, r_next) { nrsm->r_flags |= BBR_RWND_COLLAPSED; fnd++; bbr->rc_has_collapsed = 1; } bbr_log_type_rwnd_collapse(bbr, max_seq, 4, fnd); } static void bbr_un_collapse_window(struct tcp_bbr *bbr) { struct bbr_sendmap *rsm; int cleared = 0; TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { if (rsm->r_flags & BBR_RWND_COLLAPSED) { /* Clear the flag */ rsm->r_flags &= ~BBR_RWND_COLLAPSED; cleared++; } else break; } bbr_log_type_rwnd_collapse(bbr, (bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd), 0, cleared); bbr->rc_has_collapsed = 0; } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { /* * Update window information. Don't look at window if no ACK: TAC's * send garbage on first SYN. */ uint16_t nsegs; int32_t tfo_syn; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); nsegs = max(1, m->m_pkthdr.lro_nsegs); if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; bbr->r_wanted_output = 1; } else if (thflags & TH_ACK) { if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; } } if (tp->snd_wnd < ctf_outstanding(tp)) /* The peer collapsed its window on us */ bbr_collapsed_window(bbr); else if (bbr->rc_has_collapsed) bbr_un_collapse_window(bbr); /* Was persist timer active and now we have window space? 
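 * We leave persist once the peer offers at least min(rc_high_rwnd/2, one * minimum pacing segment) of window; the same threshold, inverted, decides * below when persist must be entered.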
*/ if ((bbr->rc_in_persist != 0) && (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr)))) { /* * Make the rate persist at end of persist mode if idle long * enough */ bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); /* Make sure we output to start the timer */ bbr->r_wanted_output = 1; } /* Do we need to enter persist? */ if ((bbr->rc_in_persist == 0) && (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_max == tp->snd_una) && sbavail(&tp->t_inpcb->inp_socket->so_snd) && (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { /* No send window.. we must enter persist */ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); } if (tp->t_flags2 & TF2_DROP_AF_DATA) { m_freem(m); return (0); } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept random * urgent pointers, we'll crash in soreceive. It's hard to * imagine someone actually wanting to send this much urgent * data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, then * mark the data stream. This should not happen in * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a * FIN has been received from the remote side. In these * states we ignore the URG. * * According to RFC961 (Assigned Protocols), the urgent * pointer points to the last octet of urgent data. We * continue, however, to consider it to indicate the first * octet of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, pull receive urgent * pointer along with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing * queue, and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data is * presented to the user (this happens in tcp_usrreq.c, case * PRU_RCVD). If a FIN has already been received on this connection * then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly * queue with control block tp. Set thflags to whether * reassembly now includes a segment with FIN. 
This handles * the common case inline (segment is the next to be * received on an established connection, and the queue is * empty), avoiding linkage into and removal from the queue * and repetition of various conversions. Set DELACK for * segments received in order, but ack immediately when * segments are out of order (so fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { #ifdef NETFLIX_SB_LIMITS u_int mcnt, appended; if (so->so_rcv.sb_shlim) { mcnt = m_memcnt(m); appended = 0; if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, CFO_NOSLEEP, NULL) == false) { counter_u64_add(tcp_sb_shlim_fails, 1); m_freem(m); return (0); } } #endif if (DELAY_ACK(tp, bbr, nsegs) || tfo_syn) { bbr->bbr_segs_rcvd += max(1, nsegs); tp->t_flags |= TF_DELACK; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } else { bbr->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else #ifdef NETFLIX_SB_LIMITS appended = #endif sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim && appended != mcnt) counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); #endif } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs when * trimming from the head. */ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. */ tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. */ tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { tcp_update_dsack_list(tp, save_start, save_start + tlen); } } } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know that the * connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized (ie NEEDSYN * flag on) then delay ACK, so it may be piggybacked * when SYN is sent. Otherwise, since we received a * FIN then no more input can be expected, send ACK * now. */ if (tp->t_flags & TF_NEEDSYN) { tp->t_flags |= TF_DELACK; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } else { tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES enter the * CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been * acked so enter the CLOSING state. 
*/ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the * other standard timers. */ case TCPS_FIN_WAIT_2: bbr->rc_timer_first = 1; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); INP_WLOCK_ASSERT(tp->t_inpcb); tcp_twstart(tp); return (1); } } /* * Return any desired output. */ if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > ctf_outstanding(tp))) { bbr->r_wanted_output = 1; } INP_WLOCK_ASSERT(tp->t_inpcb); return (0); } /* * Here nothing is really faster, its just that we * have broken out the fast-data path also just like * the fast-ack. Return 1 if we processed the packet * return 0 if you need to take the "slow-path". */ static int bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt) { uint16_t nsegs; int32_t newsize = 0; /* automatic sockbuf scaling */ struct tcp_bbr *bbr; #ifdef NETFLIX_SB_LIMITS u_int mcnt, appended; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* On the hpts and we would have called output */ bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if (bbr->r_ctl.rc_resend != NULL) { return (0); } if (tiwin && tiwin != tp->snd_wnd) { return (0); } if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { return (0); } if (__predict_false((to->to_flags & TOF_TS) && (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { return (0); } if (__predict_false((th->th_ack != tp->snd_una))) { return (0); } if (__predict_false(tlen > sbspace(&so->so_rcv))) { return (0); } if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * This is a pure, in-sequence data packet with nothing on the * reassembly queue and we have enough buffer space to take it. */ nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim) { mcnt = m_memcnt(m); appended = 0; if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, CFO_NOSLEEP, NULL) == false) { counter_u64_add(tcp_sb_shlim_fails, 1); m_freem(m); return (1); } } #endif /* Clean receiver SACK report if present */ if (tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. Give up when limit is * reached. 
*/ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ #ifdef NETFLIX_SB_LIMITS appended = #endif sbappendstream_locked(&so->so_rcv, m, 0); ctf_calc_rwin(so, tp); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim && mcnt != appended) counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); #endif if (DELAY_ACK(tp, bbr, nsegs)) { bbr->bbr_segs_rcvd += max(1, nsegs); tp->t_flags |= TF_DELACK; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } else { bbr->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; } return (1); } /* * This subfunction is used to try to highly optimize the * fast path. We again allow window updates that are * in sequence to remain in the fast-path. We also add * in the __predict's to attempt to help the compiler. * Note that if we return a 0, then we can *not* process * it and the caller should push the packet into the * slow-path. If we return 1, then all is well and * the packet is fully processed. */ static int bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt) { int32_t acked; uint16_t nsegs; uint32_t sack_changed; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif uint32_t prev_acked = 0; struct tcp_bbr *bbr; if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* Old ack, behind (or duplicate to) the last one rcv'd */ return (0); } if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { /* Above what we have sent? */ return (0); } if (__predict_false(tiwin == 0)) { /* zero window */ return (0); } if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { /* We need a SYN or a FIN, unlikely.. */ return (0); } if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { /* Timestamp is behind .. old ack with seq wrap? */ return (0); } if (__predict_false(IN_RECOVERY(tp->t_flags))) { /* Still recovering */ return (0); } bbr = (struct tcp_bbr *)tp->t_fb_ptr; if (__predict_false(bbr->r_ctl.rc_resend != NULL)) { /* We are retransmitting */ return (0); } if (__predict_false(bbr->rc_in_persist != 0)) { /* In persist mode */ return (0); } if (bbr->r_ctl.rc_sacked) { /* We have sack holes on our scoreboard */ return (0); } /* Ok if we reach here, we can process a fast-ack */ nsegs = max(1, m->m_pkthdr.lro_nsegs); sack_changed = bbr_log_ack(tp, to, th, &prev_acked); /* * We never detect loss in fast ack [we can't * have a sack and can't be in recovery so * we always pass 0 (nothing detected)]. */ bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, 0); /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } /* Do we need to exit persists? */ if ((bbr->rc_in_persist != 0) && (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr)))) { bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); bbr->r_wanted_output = 1; } /* Do we need to enter persists? 
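 * (This mirrors the slow-path check: established state, everything acked, * data waiting in the socket buffer, and a send window too small to cover * it means the persist timer must be started.)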
*/ if ((bbr->rc_in_persist == 0) && (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_max == tp->snd_una) && sbavail(&tp->t_inpcb->inp_socket->so_snd) && (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { /* No send window.. we must enter persist */ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); } /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = bbr->r_ctl.rc_rcvtime; tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies during the SYN+ACK * phase, ignore timestamps of 0 or we could calculate a huge RTT * and blow up the retransmit timer. */ acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(th->th_ack, tp->snd_una)) bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp)); tp->snd_una = th->th_ack; if (tp->snd_wnd < ctf_outstanding(tp)) /* The peer collapsed its window on us */ bbr_collapsed_window(bbr); else if (bbr->rc_has_collapsed) bbr_un_collapse_window(bbr); if (SEQ_GT(tp->snd_una, tp->snd_recover)) { tp->snd_recover = tp->snd_una; } bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, 0); /* * Pull snd_wl2 up to prevent seq wrap relative to th_ack. */ tp->snd_wl2 = th->th_ack; m_freem(m); /* * If all outstanding data are acked, stop retransmit timer, * otherwise restart timer using current (possibly backed-off) * value. If process is waiting for space, wakeup/selwakeup/signal. * If data are ready to send, let tcp_output decide between more * output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* Wake up the socket if we have room to write more */ sowwakeup(so); if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__); if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) bbr->rc_tp->t_acktime = 0; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); if (bbr->rc_in_persist == 0) { bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime; } sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime); /* * We invalidate the last ack here since we * don't want to transfer forward the time * for our sum's calculations. */ bbr->r_wanted_output = 1; } if (sbavail(&so->so_snd)) { bbr->r_wanted_output = 1; } return (1); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. 
*/ static int bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t todrop; int32_t ourfinisacked = 0; struct tcp_bbr *bbr; int32_t ret_val = 0; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); /* * If the state is SYN_SENT: if seg contains an ACK, but not for our * SYN, drop the input. if seg contains a RST, then drop the * connection. if seg does not contain SYN, then drop it. Otherwise * this is an acceptable SYN segment: initialize tp->rcv_nxt and * tp->irs; if seg contains ack then advance tp->snd_una. BBR does * not support ECN so we will not say we are capable. if SYN has * been acked change to ESTABLISHED else SYN_RCVD state; arrange for * segment to be acked (eventually); continue processing rest of * data/controls, beginning with URG */ if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); ctf_do_drop(m, tp); return (1); } if (thflags & TH_RST) { ctf_do_drop(m, tp); return (1); } if (!(thflags & TH_SYN)) { ctf_do_drop(m, tp); return (1); } tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial = 0; TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial = 1; } /* * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. */ if (DELAY_ACK(tp, bbr, 1) && tlen != 0 && (tfo_partial == 0)) { bbr->bbr_segs_rcvd += 1; tp->t_flags |= TF_DELACK; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } else { bbr->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; } if (SEQ_GT(th->th_ack, tp->iss)) { /* * The SYN is acked, * handle it specially. */ bbr_log_syn(tp, to); } if (SEQ_GT(th->th_ack, tp->snd_una)) { /* * We advance snd_una for the * fast open case. If th_ack is * acknowledging data beyond * snd_una we can't just call * ack-processing since the * data stream in our send-map * will start at snd_una + 1 (one * beyond the SYN). If it's just * equal we don't need to do that * and there is no send_map. */ tp->snd_una++; } /* * Received in SYN_SENT[*] state. Transitions: * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => simultaneous * open. If segment contains CC option and there is a * cached CC, apply TAO test. If it succeeds, connection is * half-synchronized.
Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If * there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_state_change(tp, TCPS_SYN_RECEIVED); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. If data, * trim to stay within window, dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. If the * remote host used T/TCP to validate the SYN, our data will be * ACK'd; if so, enter normal data segment processing in the middle * of step 5, ack processing. Otherwise, goto step 6. */ if (thflags & TH_ACK) { if ((to->to_flags & TOF_TS) != 0) { uint32_t t, rtt; t = tcp_tv_to_mssectick(&bbr->rc_tv); if (TSTMP_GEQ(t, to->to_tsecr)) { rtt = t - to->to_tsecr; if (rtt == 0) { rtt = 1; } rtt *= MS_IN_USEC; tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime); } } if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) return (ret_val); /* We may have changed to FIN_WAIT_1 above */ if (tp->t_state == TCPS_FIN_WAIT_1) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now * acknowledged then enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then * closing user can proceed. Starting the * timer is contrary to the specification, * but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and * use a compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the only valid * packets are the initial SYN, a retransmit/copy of the * initial SYN (possibly with a subset of the original * data), a valid ACK, a FIN, or a RST. 
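* (An illustrative reading of the checks that follow, not from the * original: a segment carrying both SYN and ACK draws a reset, a * bare retransmitted SYN is silently dropped while an RXT/TLP/RACK * timer is pending, and a segment with none of ACK, FIN or RST is * likewise dropped.)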
*/ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (bbr->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { ctf_do_drop(m, NULL); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { ctf_do_drop(m, NULL); return (0); } } if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know the * sequence numbers haven't wrapped. This is a partial fix for the * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } tp->snd_wnd = tiwin; /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (IS_FASTOPEN(tp->t_flags)) { cc_conn_init(tp); } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } /* * Ok, for the first time in, let's see if we can use the ts to * figure out what the initial RTT was.
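* (A worked example with hypothetical values: if our current * millisecond tick t is 5000 and the peer echoed a tsecr of 4970, * the sample is 30; a zero difference is clamped to 1, and the * result is scaled by MS_IN_USEC into microseconds for the filter.)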
*/ if ((to->to_flags & TOF_TS) != 0) { uint32_t t, rtt; t = tcp_tv_to_mssectick(&bbr->rc_tv); if (TSTMP_GEQ(t, to->to_tsecr)) { rtt = t - to->to_tsecr; if (rtt == 0) { rtt = 1; } rtt *= MS_IN_USEC; tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime); } } /* Drop off any SYN in the send map (probably not there) */ if (thflags & TH_ACK) bbr_log_syn(tp, to); if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; /* * Account for the ACK of our SYN prior to regular * ACK processing below. */ tp->snd_una++; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); } /* * If segment contains data or ACK, will call tcp_reass() later; if * not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (tp->t_state == TCPS_FIN_WAIT_1) { /* We could have gone to FIN_WAIT_1 (or EST) above */ /* * In FIN_WAIT_1 STATE in addition to the processing for the * ESTABLISHED state if our FIN is now acknowledged then * enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then closing * user can proceed. Starting the timer is contrary * to the specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { struct tcp_bbr *bbr; int32_t ret_val; /* * Header prediction: check for the two common cases of a * uni-directional data xfer. If the packet has no control flags, * is in-sequence, the window didn't change and we're not * retransmitting, it's a candidate. If the length is zero and the * ack moved forward, we're the sender side of the xfer. Just free * the data acked & wake any higher level process that was blocked * waiting for space. If the length is non-zero and the ack didn't * move, we're the receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to the socket * buffer and note that we need a delayed ack. Make sure that the * hidden state-flags are also off. Since we check for * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
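* (Concretely, the fast path below requires: no SACK option, flags * exactly TH_ACK, an empty reassembly queue, and an in-sequence * segment, i.e. th_seq == rcv_nxt; anything else falls through to * the slow path.)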
*/ bbr = (struct tcp_bbr *)tp->t_fb_ptr; if (bbr->r_ctl.rc_delivered < (4 * tp->t_maxseg)) { /* * If we have delivered under 4 segments, increase the initial * window if raised by the peer. We use this to determine * dynamic and static rwnd's at the end of a connection. */ bbr->r_ctl.rc_init_rwnd = max(tiwin, tp->snd_wnd); } if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && __predict_true(SEGQ_EMPTY(tp)) && __predict_true(th->th_seq == tp->rcv_nxt)) { if (tlen == 0) { if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt)) { return (0); } } else { if (bbr_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt)) { return (0); } } } ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (bbr_progress_timeout_check(bbr)) { ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } /* State changes only happen in bbr_process_data() */ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked.
*/ static int bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { struct tcp_bbr *bbr; int32_t ret_val; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (bbr_progress_timeout_check(bbr)) { ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static int bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr, struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so) { if (bbr->rc_allow_data_af_clo == 0) { close_now: tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); return (1); } if (sbavail(&so->so_snd) == 0) goto close_now; /* Ok we allow data that is ignored and a followup reset */ tp->rcv_nxt = th->th_seq + *tlen; tp->t_flags2 |= TF2_DROP_AF_DATA; bbr->r_wanted_output = 1; *tlen = 0; return (0); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. 
*/ static int bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { /* * We call a new function now so we might continue and setup * to reset at all data being ack'd. */ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { /* * If we can't receive any more data, then closing user can * proceed. Starting the timer is contrary to the * specification, but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? 
tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } if (sbavail(&so->so_snd)) { if (bbr_progress_timeout_check(bbr)) { ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { /* * We call a new function now so we might continue and setup * to reset at all data being ack'd. */ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. 
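* (Note, not from the original: once our FIN is acked below, the * connection moves straight to time-wait via tcp_twstart() and the * mbuf is freed; we return 1 since the TCB is gone.)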
*/ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { tcp_twstart(tp); m_freem(m); return (1); } if (sbavail(&so->so_snd)) { if (bbr_progress_timeout_check(bbr)) { ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { /* * We call a new function now so we might continue and setup * to reset at all data being ack'd. */ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * case TCPS_LAST_ACK: Ack processing. 
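* (The analogous exit here is tcp_close(): the moment our FIN is * acknowledged the TCB is closed and the segment dropped, so the * caller must treat the connection as gone.)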
*/ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { tp = tcp_close(tp); ctf_do_drop(m, tp); return (1); } if (sbavail(&so->so_snd)) { if (bbr_progress_timeout_check(bbr)) { ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then we may RST the other end depending on the outcome * of bbr_check_data_after_close. */ if ((so->so_state & SS_NOFDREF) && tlen) { /* * We call a new function now so we might continue and setup * to reset at all data being ack'd. */ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) return (1); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ INP_WLOCK_ASSERT(tp->t_inpcb); if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. 
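* (In the code below: TF_NEEDSYN queues the data via * bbr_process_data(), TF_ACKNOW answers with an ACK before * dropping, and otherwise the segment is silently dropped.)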
*/ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ INP_WLOCK_ASSERT(tp->t_inpcb); if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (bbr_progress_timeout_check(bbr)) { ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } INP_WLOCK_ASSERT(tp->t_inpcb); return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static void bbr_stop_all_timers(struct tcpcb *tp) { struct tcp_bbr *bbr; /* * Assure no timers are running. */ if (tcp_timer_active(tp, TT_PERSIST)) { /* We enter in persists, set the flag appropriately */ bbr = (struct tcp_bbr *)tp->t_fb_ptr; bbr->rc_in_persist = 1; } tcp_timer_suspend(tp, TT_PERSIST); tcp_timer_suspend(tp, TT_REXMT); tcp_timer_suspend(tp, TT_KEEP); tcp_timer_suspend(tp, TT_DELACK); } static void bbr_google_mode_on(struct tcp_bbr *bbr) { bbr->rc_use_google = 1; bbr->rc_no_pacing = 0; bbr->r_ctl.bbr_google_discount = bbr_google_discount; bbr->r_use_policer = bbr_policer_detection_enabled; bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10); bbr->bbr_use_rack_cheat = 0; bbr->r_ctl.rc_incr_tmrs = 0; bbr->r_ctl.rc_inc_tcp_oh = 0; bbr->r_ctl.rc_inc_ip_oh = 0; bbr->r_ctl.rc_inc_enet_oh = 0; reset_time(&bbr->r_ctl.rc_delrate, BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT); reset_time_small(&bbr->r_ctl.rc_rttprop, (11 * USECS_IN_SECOND)); tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv)); } static void bbr_google_mode_off(struct tcp_bbr *bbr) { bbr->rc_use_google = 0; bbr->r_ctl.bbr_google_discount = 0; bbr->no_pacing_until = bbr_no_pacing_until; bbr->r_use_policer = 0; if (bbr->no_pacing_until) bbr->rc_no_pacing = 1; else bbr->rc_no_pacing = 0; if (bbr_use_rack_resend_cheat) bbr->bbr_use_rack_cheat = 1; else bbr->bbr_use_rack_cheat = 0; if (bbr_incr_timers) bbr->r_ctl.rc_incr_tmrs = 1; else bbr->r_ctl.rc_incr_tmrs = 0; if (bbr_include_tcp_oh) bbr->r_ctl.rc_inc_tcp_oh = 1; else bbr->r_ctl.rc_inc_tcp_oh = 0; if (bbr_include_ip_oh) bbr->r_ctl.rc_inc_ip_oh = 1; else bbr->r_ctl.rc_inc_ip_oh = 0; if (bbr_include_enet_oh) bbr->r_ctl.rc_inc_enet_oh = 1; else bbr->r_ctl.rc_inc_enet_oh = 0; bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; reset_time(&bbr->r_ctl.rc_delrate, bbr_num_pktepo_for_del_limit); reset_time_small(&bbr->r_ctl.rc_rttprop, (bbr_filter_len_sec * USECS_IN_SECOND)); tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv)); } /* * Return 0 on success, non-zero on failure * which indicates the error (usually no memory). */ static int bbr_init(struct tcpcb *tp) { struct tcp_bbr *bbr = NULL; struct inpcb *inp; uint32_t cts; tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO)); if (tp->t_fb_ptr == NULL) { /* * We need to allocate memory but can't. The INP and INP_INFO * locks are held and they are recursive (this happens during setup).
So a * scheme to drop the locks fails :( * */ return (ENOMEM); } bbr = (struct tcp_bbr *)tp->t_fb_ptr; bbr->rtt_valid = 0; inp = tp->t_inpcb; inp->inp_flags2 |= INP_CANNOT_DO_ECN; inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; TAILQ_INIT(&bbr->r_ctl.rc_map); TAILQ_INIT(&bbr->r_ctl.rc_free); TAILQ_INIT(&bbr->r_ctl.rc_tmap); bbr->rc_tp = tp; if (tp->t_inpcb) { bbr->rc_inp = tp->t_inpcb; } cts = tcp_get_usecs(&bbr->rc_tv); tp->t_acktime = 0; bbr->rc_allow_data_af_clo = bbr_ignore_data_after_close; bbr->r_ctl.rc_reorder_fade = bbr_reorder_fade; bbr->rc_tlp_threshold = bbr_tlp_thresh; bbr->r_ctl.rc_reorder_shift = bbr_reorder_thresh; bbr->r_ctl.rc_pkt_delay = bbr_pkt_delay; bbr->r_ctl.rc_min_to = bbr_min_to; bbr->rc_bbr_state = BBR_STATE_STARTUP; bbr->r_ctl.bbr_lost_at_state = 0; bbr->r_ctl.rc_lost_at_startup = 0; bbr->rc_all_timers_stopped = 0; bbr->r_ctl.rc_bbr_lastbtlbw = 0; bbr->r_ctl.rc_pkt_epoch_del = 0; bbr->r_ctl.rc_pkt_epoch = 0; bbr->r_ctl.rc_lowest_rtt = 0xffffffff; bbr->r_ctl.rc_bbr_hptsi_gain = bbr_high_gain; bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain; bbr->r_ctl.rc_went_idle_time = cts; bbr->rc_pacer_started = cts; bbr->r_ctl.rc_pkt_epoch_time = cts; bbr->r_ctl.rc_rcvtime = cts; bbr->r_ctl.rc_bbr_state_time = cts; bbr->r_ctl.rc_del_time = cts; bbr->r_ctl.rc_tlp_rxt_last_time = cts; bbr->r_ctl.last_in_probertt = cts; bbr->skip_gain = 0; bbr->gain_is_limited = 0; bbr->no_pacing_until = bbr_no_pacing_until; if (bbr->no_pacing_until) bbr->rc_no_pacing = 1; if (bbr_use_google_algo) { bbr->rc_no_pacing = 0; bbr->rc_use_google = 1; bbr->r_ctl.bbr_google_discount = bbr_google_discount; bbr->r_use_policer = bbr_policer_detection_enabled; } else { bbr->rc_use_google = 0; bbr->r_ctl.bbr_google_discount = 0; bbr->r_use_policer = 0; } if (bbr_ts_limiting) bbr->rc_use_ts_limit = 1; else bbr->rc_use_ts_limit = 0; if (bbr_ts_can_raise) bbr->ts_can_raise = 1; else bbr->ts_can_raise = 0; if (V_tcp_delack_enabled == 1) tp->t_delayed_ack = 2; else if (V_tcp_delack_enabled == 0) tp->t_delayed_ack = 0; else if (V_tcp_delack_enabled < 100) tp->t_delayed_ack = V_tcp_delack_enabled; else tp->t_delayed_ack = 2; if (bbr->rc_use_google == 0) bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; else bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10); bbr->r_ctl.rc_min_rto_ms = bbr_rto_min_ms; bbr->rc_max_rto_sec = bbr_rto_max_sec; bbr->rc_init_win = bbr_def_init_win; if (tp->t_flags & TF_REQ_TSTMP) bbr->rc_last_options = TCP_TS_OVERHEAD; bbr->r_ctl.rc_pace_max_segs = tp->t_maxseg - bbr->rc_last_options; bbr->r_ctl.rc_high_rwnd = tp->snd_wnd; bbr->r_init_rtt = 1; counter_u64_add(bbr_flows_nohdwr_pacing, 1); if (bbr_allow_hdwr_pacing) bbr->bbr_hdw_pace_ena = 1; else bbr->bbr_hdw_pace_ena = 0; if (bbr_sends_full_iwnd) bbr->bbr_init_win_cheat = 1; else bbr->bbr_init_win_cheat = 0; bbr->r_ctl.bbr_utter_max = bbr_hptsi_utter_max; bbr->r_ctl.rc_drain_pg = bbr_drain_gain; bbr->r_ctl.rc_startup_pg = bbr_high_gain; bbr->rc_loss_exit = bbr_exit_startup_at_loss; bbr->r_ctl.bbr_rttprobe_gain_val = bbr_rttprobe_gain; bbr->r_ctl.bbr_hptsi_per_second = bbr_hptsi_per_second; bbr->r_ctl.bbr_hptsi_segments_delay_tar = bbr_hptsi_segments_delay_tar; bbr->r_ctl.bbr_hptsi_segments_max = bbr_hptsi_segments_max; bbr->r_ctl.bbr_hptsi_segments_floor = bbr_hptsi_segments_floor; bbr->r_ctl.bbr_hptsi_bytes_min = bbr_hptsi_bytes_min; bbr->r_ctl.bbr_cross_over = bbr_cross_over; bbr->r_ctl.rc_rtt_shrinks = cts; if (bbr->rc_use_google) { setup_time_filter(&bbr->r_ctl.rc_delrate, FILTER_TYPE_MAX, BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT); 
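/* (The max filter above tracks the delivery rate over BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT round trips; the min filter set up next tracks rtt-prop over an 11 second window.) */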
setup_time_filter_small(&bbr->r_ctl.rc_rttprop, FILTER_TYPE_MIN, (11 * USECS_IN_SECOND)); } else { setup_time_filter(&bbr->r_ctl.rc_delrate, FILTER_TYPE_MAX, bbr_num_pktepo_for_del_limit); setup_time_filter_small(&bbr->r_ctl.rc_rttprop, FILTER_TYPE_MIN, (bbr_filter_len_sec * USECS_IN_SECOND)); } bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_INIT, 0); if (bbr_uses_idle_restart) bbr->rc_use_idle_restart = 1; else bbr->rc_use_idle_restart = 0; bbr->r_ctl.rc_bbr_cur_del_rate = 0; bbr->r_ctl.rc_initial_hptsi_bw = bbr_initial_bw_bps; if (bbr_resends_use_tso) bbr->rc_resends_use_tso = 1; #ifdef NETFLIX_PEAKRATE tp->t_peakrate_thr = tp->t_maxpeakrate; #endif if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct bbr_sendmap *rsm; rsm = bbr_alloc(bbr); if (rsm == NULL) { uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; return (ENOMEM); } rsm->r_flags = BBR_OVERMAX; rsm->r_tim_lastsent[0] = cts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = tp->snd_una; rsm->r_end = tp->snd_max; rsm->r_dupack = 0; rsm->r_delivered = bbr->r_ctl.rc_delivered; rsm->r_ts_valid = 0; rsm->r_del_ack_ts = tp->ts_recent; rsm->r_del_time = cts; if (bbr->r_ctl.r_app_limited_until) rsm->r_app_limited = 1; else rsm->r_app_limited = 0; TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) rsm->r_bbr_state = bbr_state_val(bbr); else rsm->r_bbr_state = 8; } if (bbr_use_rack_resend_cheat && (bbr->rc_use_google == 0)) bbr->bbr_use_rack_cheat = 1; if (bbr_incr_timers && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_incr_tmrs = 1; if (bbr_include_tcp_oh && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_inc_tcp_oh = 1; if (bbr_include_ip_oh && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_inc_ip_oh = 1; if (bbr_include_enet_oh && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_inc_enet_oh = 1; bbr_log_type_statechange(bbr, cts, __LINE__); if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_srtt)) { uint32_t rtt; rtt = (TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); } /* announce the settings and state */ bbr_log_settings_change(bbr, BBR_RECOVERY_LOWRTT); tcp_bbr_tso_size_check(bbr, cts); /* * Now call the generic function to start a timer. This will place * the TCB on the hptsi wheel if a timer is needed with appropriate * flags. */ bbr_stop_all_timers(tp); bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0); return (0); } /* * Return 0 if we can accept the connection. Return * non-zero if we can't handle the connection. An EAGAIN * means you need to wait until the connection is up. * An EADDRNOTAVAIL means we can never handle the connection * (no SACK). */ static int bbr_handoff_ok(struct tcpcb *tp) { if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { /* Sure, no problem, though it may not stick */ return (0); } if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) { /* * We really don't know; you have to get to ESTAB or beyond * to tell. */ return (EAGAIN); } if ((tp->t_flags & TF_SACK_PERMIT) || bbr_sack_not_required) { return (0); } /* * If we reach here we don't do SACK on this connection so we can * never do rack.
*/ return (EINVAL); } static void bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged) { if (tp->t_fb_ptr) { uint32_t calc; struct tcp_bbr *bbr; struct bbr_sendmap *rsm; bbr = (struct tcp_bbr *)tp->t_fb_ptr; if (bbr->r_ctl.crte) tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); bbr_log_flowend(bbr); bbr->rc_tp = NULL; if (tp->t_inpcb) { /* Backout any flags2 we applied */ tp->t_inpcb->inp_flags2 &= ~INP_CANNOT_DO_ECN; tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; } if (bbr->bbr_hdrw_pacing) counter_u64_add(bbr_flows_whdwr_pacing, -1); else counter_u64_add(bbr_flows_nohdwr_pacing, -1); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); while (rsm) { TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); uma_zfree(bbr_zone, rsm); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); } rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); while (rsm) { TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next); uma_zfree(bbr_zone, rsm); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); } calc = bbr->r_ctl.rc_high_rwnd - bbr->r_ctl.rc_init_rwnd; if (calc > (bbr->r_ctl.rc_init_rwnd / 10)) BBR_STAT_INC(bbr_dynamic_rwnd); else BBR_STAT_INC(bbr_static_rwnd); bbr->r_ctl.rc_free_cnt = 0; uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } /* Make sure snd_nxt is correctly set */ tp->snd_nxt = tp->snd_max; } static void bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win) { switch (tp->t_state) { case TCPS_SYN_SENT: bbr->r_state = TCPS_SYN_SENT; bbr->r_substate = bbr_do_syn_sent; break; case TCPS_SYN_RECEIVED: bbr->r_state = TCPS_SYN_RECEIVED; bbr->r_substate = bbr_do_syn_recv; break; case TCPS_ESTABLISHED: bbr->r_ctl.rc_init_rwnd = max(win, bbr->rc_tp->snd_wnd); bbr->r_state = TCPS_ESTABLISHED; bbr->r_substate = bbr_do_established; break; case TCPS_CLOSE_WAIT: bbr->r_state = TCPS_CLOSE_WAIT; bbr->r_substate = bbr_do_close_wait; break; case TCPS_FIN_WAIT_1: bbr->r_state = TCPS_FIN_WAIT_1; bbr->r_substate = bbr_do_fin_wait_1; break; case TCPS_CLOSING: bbr->r_state = TCPS_CLOSING; bbr->r_substate = bbr_do_closing; break; case TCPS_LAST_ACK: bbr->r_state = TCPS_LAST_ACK; bbr->r_substate = bbr_do_lastack; break; case TCPS_FIN_WAIT_2: bbr->r_state = TCPS_FIN_WAIT_2; bbr->r_substate = bbr_do_fin_wait_2; break; case TCPS_LISTEN: case TCPS_CLOSED: case TCPS_TIME_WAIT: default: break; }; } static void bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog) { /* * Now what state are we going into now? Is there adjustments * needed? */ int32_t old_state, old_gain; old_state = bbr_state_val(bbr); old_gain = bbr->r_ctl.rc_bbr_hptsi_gain; if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) { /* Save the lowest srtt we saw in our end of the sub-state */ bbr->rc_hit_state_1 = 0; if (bbr->r_ctl.bbr_smallest_srtt_this_state != 0xffffffff) bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state; } bbr->rc_bbr_substate++; if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) { /* Cycle back to first state-> gain */ bbr->rc_bbr_substate = 0; } if (bbr_state_val(bbr) == BBR_SUB_GAIN) { /* * We enter the gain(5/4) cycle (possibly less if * shallow buffer detection is enabled) */ if (bbr->skip_gain) { /* * Hardware pacing has set our rate to * the max and limited our b/w just * do level i.e. no gain. 
*/ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_LEVEL1]; } else if (bbr->gain_is_limited && bbr->bbr_hdrw_pacing && bbr->r_ctl.crte) { /* * We can't gain above the hardware pacing * rate, which is less than our rate plus the gain; * calculate the gain needed to reach the hardware * pacing rate. */ uint64_t bw, rate, gain_calc; bw = bbr_get_bw(bbr); rate = bbr->r_ctl.crte->rate; if ((rate > bw) && (((bw * (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]) / (uint64_t)BBR_UNIT) > rate)) { gain_calc = (rate * BBR_UNIT) / bw; if (gain_calc < BBR_UNIT) gain_calc = BBR_UNIT; bbr->r_ctl.rc_bbr_hptsi_gain = (uint16_t)gain_calc; } else { bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN]; } } else bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN]; if ((bbr->rc_use_google == 0) && (bbr_gain_to_target == 0)) { bbr->r_ctl.rc_bbr_state_atflight = cts; } else bbr->r_ctl.rc_bbr_state_atflight = 0; } else if (bbr_state_val(bbr) == BBR_SUB_DRAIN) { bbr->rc_hit_state_1 = 1; bbr->r_ctl.rc_exta_time_gd = 0; bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if (bbr_state_drain_2_tar) { bbr->r_ctl.rc_bbr_state_atflight = 0; } else bbr->r_ctl.rc_bbr_state_atflight = cts; bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_DRAIN]; } else { /* All other cycles hit here 2-7 */ if ((old_state == BBR_SUB_DRAIN) && bbr->rc_hit_state_1) { if (bbr_sub_drain_slam_cwnd && (bbr->rc_use_google == 0) && (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if ((cts - bbr->r_ctl.rc_bbr_state_time) > bbr_get_rtt(bbr, BBR_RTT_PROP)) bbr->r_ctl.rc_exta_time_gd += ((cts - bbr->r_ctl.rc_bbr_state_time) - bbr_get_rtt(bbr, BBR_RTT_PROP)); else bbr->r_ctl.rc_exta_time_gd = 0; if (bbr->r_ctl.rc_exta_time_gd) { bbr->r_ctl.rc_level_state_extra = bbr->r_ctl.rc_exta_time_gd; /* Now chop up the time for each state (div by 7) */ bbr->r_ctl.rc_level_state_extra /= 7; if (bbr_rand_ot && bbr->r_ctl.rc_level_state_extra) { /* Add a randomization */ bbr_randomize_extra_state_time(bbr); } } } bbr->r_ctl.rc_bbr_state_atflight = max(1, cts); bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[bbr_state_val(bbr)]; } if (bbr->rc_use_google) { bbr->r_ctl.rc_bbr_state_atflight = max(1, cts); } bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain; if (dolog) bbr_log_type_statechange(bbr, cts, line); if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { uint32_t time_in; time_in = cts - bbr->r_ctl.rc_bbr_state_time; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { counter_u64_add(bbr_state_time[(old_state + 5)], time_in); } else { counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } } bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff; bbr_set_state_target(bbr, __LINE__); if (bbr_sub_drain_slam_cwnd && (bbr->rc_use_google == 0) && (bbr_state_val(bbr) == BBR_SUB_DRAIN)) { /* Slam down the cwnd */ bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; if (bbr_sub_drain_app_limit) { /* Go app limited if we are on a long drain */ bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes))); } bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if (bbr->rc_lt_use_bw) { /* In policed mode we clamp pacing_gain to BBR_UNIT */ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; } /* Google changes TSO size every cycle */
if (bbr->rc_use_google) tcp_bbr_tso_size_check(bbr, cts); bbr->r_ctl.gain_epoch = cts; bbr->r_ctl.rc_bbr_state_time = cts; bbr->r_ctl.substate_pe = bbr->r_ctl.rc_pkt_epoch; } static void bbr_set_probebw_google_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses) { if ((bbr_state_val(bbr) == BBR_SUB_DRAIN) && (google_allow_early_out == 1) && (bbr->r_ctl.rc_flight_at_input <= bbr->r_ctl.rc_target_at_state)) { /* We have reached our target flight size, possibly early */ goto change_state; } if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) { return; } if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_get_rtt(bbr, BBR_RTT_PROP)) { /* * Must be a rttProp movement forward before * we can change states. */ return; } if (bbr_state_val(bbr) == BBR_SUB_GAIN) { /* * The needed time has passed but for * the gain cycle extra rules apply: * 1) If we have seen loss, we exit * 2) If we have not reached the target * we stay in GAIN (gain-to-target). */ if (google_consider_lost && losses) goto change_state; if (bbr->r_ctl.rc_target_at_state > bbr->r_ctl.rc_flight_at_input) { return; } } change_state: /* For gain we must reach our target, all others last 1 rttProp */ bbr_substate_change(bbr, cts, __LINE__, 1); } static void bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses) { uint32_t flight, bbr_cur_cycle_time; if (bbr->rc_use_google) { bbr_set_probebw_google_gains(bbr, cts, losses); return; } if (cts == 0) { /* * Never allow cts to be 0; we * do this so we can judge if * we have set a timestamp. */ cts = 1; } if (bbr_state_is_pkt_epoch) bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PKTRTT); else bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PROP); if (bbr->r_ctl.rc_bbr_state_atflight == 0) { if (bbr_state_val(bbr) == BBR_SUB_DRAIN) { flight = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if (bbr_sub_drain_slam_cwnd && bbr->rc_hit_state_1) { /* Keep it slammed down */ if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state) { bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if (bbr_sub_drain_app_limit) { /* Go app limited if we are on a long drain */ bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + flight); } } if (TSTMP_GT(cts, bbr->r_ctl.gain_epoch) && (((cts - bbr->r_ctl.gain_epoch) > bbr_get_rtt(bbr, BBR_RTT_PROP)) || (flight >= bbr->r_ctl.flightsize_at_drain))) { /* * Still here after the same time as * the gain. We need to drain harder * for the next srtt. Reduce by a set amount; * the gain drop is capped at the DRAIN state's * value (88). */ bbr->r_ctl.flightsize_at_drain = flight; if (bbr_drain_drop_mul && bbr_drain_drop_div && (bbr_drain_drop_mul < bbr_drain_drop_div)) { /* Use your specific drop value (def 4/5 = 20%) */ bbr->r_ctl.rc_bbr_hptsi_gain *= bbr_drain_drop_mul; bbr->r_ctl.rc_bbr_hptsi_gain /= bbr_drain_drop_div; } else { /* You get drop of 20% */ bbr->r_ctl.rc_bbr_hptsi_gain *= 4; bbr->r_ctl.rc_bbr_hptsi_gain /= 5; } if (bbr->r_ctl.rc_bbr_hptsi_gain <= bbr_drain_floor) { /* Reduce our gain again to the bottom */ bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1); } bbr_log_exit_gain(bbr, cts, 4); /* * Extend out so we wait another * epoch before dropping again.
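* (A worked example with hypothetical numbers: an hptsi gain of 100 * cut by the default 4/5 becomes 80; repeated cuts bottom out at * bbr_drain_floor, so the gain never collapses to zero.)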
*/ bbr->r_ctl.gain_epoch = cts; } if (flight <= bbr->r_ctl.rc_target_at_state) { if (bbr_sub_drain_slam_cwnd && (bbr->rc_use_google == 0) && (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); bbr_log_exit_gain(bbr, cts, 3); } } else { /* It's a gain */ if (bbr->r_ctl.rc_lost > bbr->r_ctl.bbr_lost_at_state) { bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); goto change_state; } if ((ctf_outstanding(bbr->rc_tp) >= bbr->r_ctl.rc_target_at_state) || ((ctf_outstanding(bbr->rc_tp) + bbr->rc_tp->t_maxseg - 1) >= bbr->rc_tp->snd_wnd)) { bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); bbr_log_exit_gain(bbr, cts, 2); } } /* * We fall through and return; always one of two things has * occurred: * 1) We are still not at target, or * * 2) We reached the target and set rc_bbr_state_atflight, * which means we no longer hit this block * next time we are called. */ return; } change_state: if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) return; if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_cur_cycle_time) { /* Less than a full time-period has passed */ return; } if (bbr->r_ctl.rc_level_state_extra && (bbr_state_val(bbr) > BBR_SUB_DRAIN) && ((cts - bbr->r_ctl.rc_bbr_state_time) < (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) { /* Less than a full time-period + extra has passed */ return; } if (bbr_gain_gets_extra_too && bbr->r_ctl.rc_level_state_extra && (bbr_state_val(bbr) == BBR_SUB_GAIN) && ((cts - bbr->r_ctl.rc_bbr_state_time) < (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) { /* Less than a full time-period + extra has passed */ return; } bbr_substate_change(bbr, cts, __LINE__, 1); } static uint32_t bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain) { uint32_t mss, tar; if (bbr->rc_use_google) { /* Google just uses the cwnd target */ tar = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), gain); } else { mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); /* Get the base cwnd with gain rounded to a mss */ tar = roundup(bbr_get_raw_target_cwnd(bbr, bbr_get_bw(bbr), gain), mss); /* Make sure it is within our min */ if (tar < get_min_cwnd(bbr)) return (get_min_cwnd(bbr)); } return (tar); } static void bbr_set_state_target(struct tcp_bbr *bbr, int line) { uint32_t tar, meth; if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) { /* Special case using old probe-rtt method */ tar = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); meth = 1; } else { /* Non-probe-rtt case and reduced probe-rtt */ if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT)) { /* For gain cycle we use the hptsi gain */ tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain); meth = 2; } else if ((bbr_target_is_bbunit) || bbr->rc_use_google) { /* * If configured, or for google all other states * get BBR_UNIT. */ tar = bbr_get_a_state_target(bbr, BBR_UNIT); meth = 3; } else { /* * Or we set a target based on the pacing gain * for non-google mode and default (non-configured). * Note we don't set a target goal below drain (192).
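* (Illustratively: a pacing gain below the DRAIN gain is clamped up * to the DRAIN value when computing the target, method 4 below; * otherwise the configured hptsi gain is used directly, method 5.)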
*/ if (bbr->r_ctl.rc_bbr_hptsi_gain < bbr_hptsi_gain[BBR_SUB_DRAIN]) { tar = bbr_get_a_state_target(bbr, bbr_hptsi_gain[BBR_SUB_DRAIN]); meth = 4; } else { tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain); meth = 5; } } } bbr_log_set_of_state_target(bbr, tar, line, meth); bbr->r_ctl.rc_target_at_state = tar; } static void bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line) { /* Change to probe_rtt */ uint32_t time_in; bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.flightsize_at_drain + bbr->r_ctl.rc_delivered); /* Setup so we force feed the filter */ if (bbr->rc_use_google || bbr_probertt_sets_rtt) bbr->rc_prtt_set_ts = 1; if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { time_in = cts - bbr->r_ctl.rc_bbr_state_time; counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_ENTERPROBE, 0); bbr->r_ctl.rc_rtt_shrinks = cts; bbr->r_ctl.last_in_probertt = cts; bbr->r_ctl.rc_probertt_srttchktim = cts; bbr->r_ctl.rc_bbr_state_time = cts; bbr->rc_bbr_state = BBR_STATE_PROBE_RTT; /* We need to force the filter to update */ if ((bbr_sub_drain_slam_cwnd) && bbr->rc_hit_state_1 && (bbr->rc_use_google == 0) && (bbr_state_val(bbr) == BBR_SUB_DRAIN)) { if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_saved_cwnd) bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; } else bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; /* Update the lost */ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; if ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google){ /* Set to the non-configurable default of 4 (PROBE_RTT_MIN) */ bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; bbr_log_set_of_state_target(bbr, bbr->rc_tp->snd_cwnd, __LINE__, 6); bbr->r_ctl.rc_target_at_state = bbr->rc_tp->snd_cwnd; } else { /* * We bring it down slowly by using a hptsi gain that is * probably 75%. This will slowly float down our outstanding * without tampering with the cwnd. */ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val; bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; bbr_set_state_target(bbr, __LINE__); if (bbr_prtt_slam_cwnd && (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } } if (ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= bbr->r_ctl.rc_target_at_state) { /* We are at target */ bbr->r_ctl.rc_bbr_enters_probertt = cts; } else { /* We need to come down to reach target before our time begins */ bbr->r_ctl.rc_bbr_enters_probertt = 0; } bbr->r_ctl.rc_pe_of_prtt = bbr->r_ctl.rc_pkt_epoch; BBR_STAT_INC(bbr_enter_probertt); bbr_log_exit_gain(bbr, cts, 0); bbr_log_type_statechange(bbr, cts, line); } static void bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts) { /* * Sanity check on probe-rtt intervals. * In crazy situations where we are competing * against new-reno flows with huge buffers * our rtt-prop interval could come to dominate * things if we can't get through a full set * of cycles, we need to adjust it. 
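* (Illustrative arithmetic, hypothetical numbers: baseval below is * rtt-prop times (BBR_SUBSTATE_COUNT + 1), rounded up to whole * seconds; e.g. a 200 msec rtt-prop across nine intervals needs * about 2 seconds, and if that exceeds the current probe interval, * the interval and the filter length are stretched.)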
*/ if (bbr_can_adjust_probertt && (bbr->rc_use_google == 0)) { uint16_t val = 0; uint32_t cur_rttp, fval, newval, baseval; /* Are we too small and going into probe-rtt too often? */ baseval = (bbr_get_rtt(bbr, BBR_RTT_PROP) * (BBR_SUBSTATE_COUNT + 1)); cur_rttp = roundup(baseval, USECS_IN_SECOND); fval = bbr_filter_len_sec * USECS_IN_SECOND; if (bbr_is_ratio == 0) { if (fval > bbr_rtt_probe_limit) newval = cur_rttp + (fval - bbr_rtt_probe_limit); else newval = cur_rttp; } else { int mul; mul = fval / bbr_rtt_probe_limit; newval = cur_rttp * mul; } if (cur_rttp > bbr->r_ctl.rc_probertt_int) { bbr->r_ctl.rc_probertt_int = cur_rttp; reset_time_small(&bbr->r_ctl.rc_rttprop, newval); val = 1; } else { /* * No adjustments were made; * do we need to shrink it? */ if (bbr->r_ctl.rc_probertt_int > bbr_rtt_probe_limit) { if (cur_rttp <= bbr_rtt_probe_limit) { /* * Things have calmed down; let's * shrink all the way to the default */ bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; reset_time_small(&bbr->r_ctl.rc_rttprop, (bbr_filter_len_sec * USECS_IN_SECOND)); cur_rttp = bbr_rtt_probe_limit; newval = (bbr_filter_len_sec * USECS_IN_SECOND); val = 2; } else { /* * Well, does some adjustment make sense? */ if (cur_rttp < bbr->r_ctl.rc_probertt_int) { /* We can reduce interval time some */ bbr->r_ctl.rc_probertt_int = cur_rttp; reset_time_small(&bbr->r_ctl.rc_rttprop, newval); val = 3; } } } } if (val) bbr_log_rtt_shrinks(bbr, cts, cur_rttp, newval, __LINE__, BBR_RTTS_RESETS_VALUES, val); } } static void bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* Exit probe-rtt */ if (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd) { tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } bbr_log_exit_gain(bbr, cts, 1); bbr->rc_hit_state_1 = 0; bbr->r_ctl.rc_rtt_shrinks = cts; bbr->r_ctl.last_in_probertt = cts; bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_RTTPROBE, 0); bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered); if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { uint32_t time_in; time_in = cts - bbr->r_ctl.rc_bbr_state_time; counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } if (bbr->rc_filled_pipe) { /* Switch to probe_bw */ bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain; bbr_substate_change(bbr, cts, __LINE__, 0); bbr_log_type_statechange(bbr, cts, __LINE__); } else { /* Back to startup */ bbr->rc_bbr_state = BBR_STATE_STARTUP; bbr->r_ctl.rc_bbr_state_time = cts; /* * We don't want to give a complete free pass of 3 * measurements until we exit, so we add the number * of pkt-epochs we spent in probe-rtt * to the startup epoch. That way * we will still retain the old state.
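* (Example with hypothetical numbers: entering probe-rtt at * pkt-epoch 40 and leaving at 43 advances last_startup_epoch by 3, * so those epochs do not count toward the startup exit test.)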
*/ bbr->r_ctl.rc_bbr_last_startup_epoch += (bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_pe_of_prtt); bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; /* Make sure to use the lower pg when shifting back in */ if (bbr->r_ctl.rc_lost && bbr_use_lower_gain_in_startup && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower; else bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; /* Probably not needed but set it anyway */ bbr_set_state_target(bbr, __LINE__); bbr_log_type_statechange(bbr, cts, __LINE__); bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 0); } bbr_check_probe_rtt_limits(bbr, cts); } static int32_t inline bbr_should_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts) { if ((bbr->rc_past_init_win == 1) && (bbr->rc_in_persist == 0) && (bbr_calc_time(cts, bbr->r_ctl.rc_rtt_shrinks) >= bbr->r_ctl.rc_probertt_int)) { return (1); } if (bbr_can_force_probertt && (bbr->rc_in_persist == 0) && (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) && ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) { return (1); } return (0); } static int32_t bbr_google_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t pkt_epoch) { uint64_t btlbw, gain; if (pkt_epoch == 0) { /* * Need to be on a pkt-epoch to continue. */ return (0); } btlbw = bbr_get_full_bw(bbr); gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; if (btlbw >= gain) { bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3); bbr->r_ctl.rc_bbr_lastbtlbw = btlbw; } if ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) return (1); bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8); return (0); } static int32_t inline bbr_state_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch) { /* Have we gained 25% in the last 3 packet-based epochs? */ uint64_t btlbw, gain; int do_exit; int delta, rtt_gain; if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) && (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { /* * This qualifies as an RTT_PROBE session since we drop the * data outstanding to nothing and waited more than * bbr_rtt_probe_time. */ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } if (bbr_should_enter_probe_rtt(bbr, cts)) { bbr_enter_probe_rtt(bbr, cts, __LINE__); return (0); } if (bbr->rc_use_google) return (bbr_google_startup(bbr, cts, pkt_epoch)); if ((bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) && (bbr_use_lower_gain_in_startup)) { /* Drop to a lower gain (1.5x) since we saw loss */ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower; } if (pkt_epoch == 0) { /* * Need to be on a pkt-epoch to continue. */ return (0); } if (bbr_rtt_gain_thresh) { /* * Do we allow a flow to stay * in startup with no loss and no * gain in rtt over a set threshold?
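* The check below works in whole percent: rtt_gain = (delta * 100) / startup_last_srtt. As an illustrative (assumed) example, startup_last_srtt = 20000 usec and rc_pkt_epoch_rtt = 21000 usec give delta = 1000 and rtt_gain = 5; with no loss and 5 under bbr_rtt_gain_thresh, the flow may remain in startup.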
*/ if (bbr->r_ctl.rc_pkt_epoch_rtt && bbr->r_ctl.startup_last_srtt && (bbr->r_ctl.rc_pkt_epoch_rtt > bbr->r_ctl.startup_last_srtt)) { delta = bbr->r_ctl.rc_pkt_epoch_rtt - bbr->r_ctl.startup_last_srtt; rtt_gain = (delta * 100) / bbr->r_ctl.startup_last_srtt; } else rtt_gain = 0; if ((bbr->r_ctl.startup_last_srtt == 0) || (bbr->r_ctl.rc_pkt_epoch_rtt < bbr->r_ctl.startup_last_srtt)) /* First time or new lower value */ bbr->r_ctl.startup_last_srtt = bbr->r_ctl.rc_pkt_epoch_rtt; if ((bbr->r_ctl.rc_lost == 0) && (rtt_gain < bbr_rtt_gain_thresh)) { /* * No loss, and we are under * our gain threshold for * increasing RTT. */ if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch) bbr->r_ctl.rc_bbr_last_startup_epoch++; bbr_log_startup_event(bbr, cts, rtt_gain, delta, bbr->r_ctl.startup_last_srtt, 10); return (0); } } if ((bbr->r_ctl.r_measurement_count == bbr->r_ctl.last_startup_measure) && (bbr->r_ctl.rc_lost_at_startup == bbr->r_ctl.rc_lost) && (!IN_RECOVERY(bbr->rc_tp->t_flags))) { /* * We only assess if we have a new measurement when * we have no loss and are not in recovery. * Drag our last_startup epoch up by one so we hold * the number of non-gain epochs we have already accumulated. */ if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch) bbr->r_ctl.rc_bbr_last_startup_epoch++; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 9); return (0); } /* Case where we reduced the lost (bad retransmit) */ if (bbr->r_ctl.rc_lost_at_startup > bbr->r_ctl.rc_lost) bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; bbr->r_ctl.last_startup_measure = bbr->r_ctl.r_measurement_count; btlbw = bbr_get_full_bw(bbr); if (bbr->r_ctl.rc_bbr_hptsi_gain == bbr_startup_lower) gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * (uint64_t)bbr_low_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; else gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; do_exit = 0; if (btlbw > bbr->r_ctl.rc_bbr_lastbtlbw) bbr->r_ctl.rc_bbr_lastbtlbw = btlbw; if (btlbw >= gain) { bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch; /* Update the lost so we won't exit in next set of tests */ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3); } if ((bbr->rc_loss_exit && (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) && (bbr->r_ctl.rc_pkt_epoch_loss_rate > bbr_startup_loss_thresh)) && ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)) { /* * If we had no gain, we had loss and that loss was above * our threshold, the rwnd is not constrained, and we have * had at least 3 packet epochs, exit. Note that this is * switched off by sysctl. Google does not do this by the * way.
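* For scale, with assumed rather than measured numbers: the growth target above is gain = lastbtlbw + (lastbtlbw * bbr_start_exit) / 100, so rc_bbr_lastbtlbw = 1000000 bytes/sec and bbr_start_exit = 25 require btlbw to reach 1250000 bytes/sec before the epoch counter is reset; 3 epochs short of that target (per the comment above) end startup.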
*/ if ((ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + (2 * max(bbr->r_ctl.rc_pace_max_segs, bbr->rc_tp->t_maxseg))) <= bbr->rc_tp->snd_wnd) { do_exit = 1; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 4); } else { /* Just record an updated loss value */ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 5); } } else bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; if (((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) || do_exit) { /* Return 1 to exit the startup state. */ return (1); } /* Stay in startup */ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8); return (0); } static void bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses) { /* * A tick occurred in the rtt epoch; do we need to do anything? */ #ifdef BBR_INVARIANTS if ((bbr->rc_bbr_state != BBR_STATE_STARTUP) && (bbr->rc_bbr_state != BBR_STATE_DRAIN) && (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) && (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) && (bbr->rc_bbr_state != BBR_STATE_PROBE_BW)) { /* Debug code? */ panic("Unknown BBR state %d?\n", bbr->rc_bbr_state); } #endif if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { /* Do we exit the startup state? */ if (bbr_state_startup(bbr, cts, epoch, pkt_epoch)) { uint32_t time_in; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 6); bbr->rc_filled_pipe = 1; bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { time_in = cts - bbr->r_ctl.rc_bbr_state_time; counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } else time_in = 0; if (bbr->rc_no_pacing) bbr->rc_no_pacing = 0; bbr->r_ctl.rc_bbr_state_time = cts; bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_drain_pg; bbr->rc_bbr_state = BBR_STATE_DRAIN; bbr_set_state_target(bbr, __LINE__); if ((bbr->rc_use_google == 0) && bbr_slam_cwnd_in_main_drain) { /* Here we don't have to worry about probe-rtt */ bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain; bbr_log_type_statechange(bbr, cts, __LINE__); if (ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= bbr->r_ctl.rc_target_at_state) { /* * Switch to probe_bw if we are already * there */ bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); bbr_substate_change(bbr, cts, __LINE__, 0); bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr_log_type_statechange(bbr, cts, __LINE__); } } } else if (bbr->rc_bbr_state == BBR_STATE_IDLE_EXIT) { uint32_t inflight; struct tcpcb *tp; tp = bbr->rc_tp; inflight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if (inflight >= bbr->r_ctl.rc_target_at_state) { /* We have reached a flight of the cwnd target */ bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; bbr_set_state_target(bbr, __LINE__); /* * Rig it so we don't do anything crazy and * start fresh with a new randomization.
*/ bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff; bbr->rc_bbr_substate = BBR_SUB_LEVEL6; bbr_substate_change(bbr, cts, __LINE__, 1); } } else if (bbr->rc_bbr_state == BBR_STATE_DRAIN) { /* Has in-flight reached the bdp (or less)? */ uint32_t inflight; struct tcpcb *tp; tp = bbr->rc_tp; inflight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if ((bbr->rc_use_google == 0) && bbr_slam_cwnd_in_main_drain && (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { /* * Here we don't have to worry about probe-rtt; * re-slam it, but keep it slammed down. */ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if (inflight <= bbr->r_ctl.rc_target_at_state) { /* We have drained */ bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { uint32_t time_in; time_in = cts - bbr->r_ctl.rc_bbr_state_time; counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } if ((bbr->rc_use_google == 0) && bbr_slam_cwnd_in_main_drain && (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { /* Restore the cwnd */ tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } /* Setup for probe-rtt has been done now (RRS-HERE) */ bbr->r_ctl.rc_rtt_shrinks = cts; bbr->r_ctl.last_in_probertt = cts; bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_LEAVE_DRAIN, 0); /* Randomly pick a sub-state */ bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); bbr_substate_change(bbr, cts, __LINE__, 0); bbr_log_type_statechange(bbr, cts, __LINE__); } } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) { uint32_t flight; flight = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr->r_ctl.r_app_limited_until = (flight + bbr->r_ctl.rc_delivered); if (((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google) && (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { /* * We must keep cwnd at the desired MSS. */ bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } else if ((bbr_prtt_slam_cwnd) && (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { /* Re-slam it */ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if (bbr->r_ctl.rc_bbr_enters_probertt == 0) { /* Has outstanding reached our target? */ if (flight <= bbr->r_ctl.rc_target_at_state) { bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_REACHTAR, 0); bbr->r_ctl.rc_bbr_enters_probertt = cts; /* If time is exactly 0, be 1usec off */ if (bbr->r_ctl.rc_bbr_enters_probertt == 0) bbr->r_ctl.rc_bbr_enters_probertt = 1; if (bbr->rc_use_google == 0) { /* * Restore any lowering that has occurred to * reach here */ if (bbr->r_ctl.bbr_rttprobe_gain_val) bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val; else bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; } } if ((bbr->r_ctl.rc_bbr_enters_probertt == 0) && (bbr->rc_use_google == 0) && bbr->r_ctl.bbr_rttprobe_gain_val && (((cts - bbr->r_ctl.rc_probertt_srttchktim) > bbr_get_rtt(bbr, bbr_drain_rtt)) || (flight >= bbr->r_ctl.flightsize_at_drain))) { /* * We have dawdled with our current hptsi * gain for an srtt and have still not made it * to target, or we have increased our flight.
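* (Worked arithmetic with an assumed value: if bbr_rttprobe_gain_val were 192, the reduction applied below is red = max(192 / 10, 1) = 19 gain units per srtt, and the gain is floored so it never drops below max(bbr_drain_floor, 1).)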
* Let's reduce the gain by xx%, * flooring the reduction at DRAIN (based on * mul/div) */ int red; bbr->r_ctl.flightsize_at_drain = flight; bbr->r_ctl.rc_probertt_srttchktim = cts; red = max((bbr->r_ctl.bbr_rttprobe_gain_val / 10), 1); if ((bbr->r_ctl.rc_bbr_hptsi_gain - red) > max(bbr_drain_floor, 1)) { /* Reduce our gain again */ bbr->r_ctl.rc_bbr_hptsi_gain -= red; bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG, 0); } else if (bbr->r_ctl.rc_bbr_hptsi_gain > max(bbr_drain_floor, 1)) { /* one more chance before we give up */ bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1); bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG_FINAL, 0); } else { /* At the very bottom */ bbr->r_ctl.rc_bbr_hptsi_gain = max((bbr_drain_floor-1), 1); } } } if (bbr->r_ctl.rc_bbr_enters_probertt && (TSTMP_GT(cts, bbr->r_ctl.rc_bbr_enters_probertt)) && ((cts - bbr->r_ctl.rc_bbr_enters_probertt) >= bbr_rtt_probe_time)) { /* Time to exit probe RTT normally */ bbr_exit_probe_rtt(bbr->rc_tp, bbr, cts); } } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) && (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { /* * This qualifies as an RTT_PROBE session since we * drop the data outstanding to nothing and waited * more than bbr_rtt_probe_time. */ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } if (bbr_should_enter_probe_rtt(bbr, cts)) { bbr_enter_probe_rtt(bbr, cts, __LINE__); } else { bbr_set_probebw_gains(bbr, cts, losses); } } } static void bbr_check_bbr_for_state(struct tcp_bbr *bbr, uint32_t cts, int32_t line, uint32_t losses) { int32_t epoch = 0; if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) { bbr_set_epoch(bbr, cts, line); /* At each epoch do lt bw sampling */ epoch = 1; } bbr_state_change(bbr, cts, epoch, bbr->rc_is_pkt_epoch_now, losses); } static int bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv) { int32_t thflags, retval; uint32_t cts, lcts; uint32_t tiwin; struct tcpopt to; struct tcp_bbr *bbr; struct bbr_sendmap *rsm; struct timeval ltv; int32_t did_out = 0; int32_t in_recovery; uint16_t nsegs; int32_t prev_state; uint32_t lost; nsegs = max(1, m->m_pkthdr.lro_nsegs); bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* add in our stats */ kern_prefetch(bbr, &prev_state); prev_state = 0; thflags = th->th_flags; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either a locked or an unlocked state, as the * caller may have unnecessarily acquired a write lock due to a * race. */ INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); tp->t_rcvtime = ticks; /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ?
TO_SYN : 0); if (m->m_flags & M_TSTMP) { /* Prefer the hardware timestamp if present */ struct timespec ts; mbuf_tstmp2timespec(m, &ts); bbr->rc_tv.tv_sec = ts.tv_sec; bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); } else if (m->m_flags & M_TSTMP_LRO) { /* Next the arrival timestamp */ struct timespec ts; mbuf_tstmp2timespec(m, &ts); bbr->rc_tv.tv_sec = ts.tv_sec; bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); } else { /* * Ok just get the current time. */ bbr->r_ctl.rc_rcvtime = lcts = cts = tcp_get_usecs(&bbr->rc_tv); } /* * If echoed timestamp is later than the current time, fall back to * non-RFC1323 RTT calculation. Normalize timestamp if syncookies * were used when this connection was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv))) to.to_tsecr = 0; } /* * If it's the first time in, we need to take care of options and * verify we can do SACK for rack! */ if (bbr->r_state == 0) { /* * Process options only when we get SYN/ACK back. The SYN * case for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a SYN * or SYN,ACK) segment itself is never scaled. XXX * this is traditional behavior, may need to be cleaned up. */ if (bbr->rc_inp == NULL) { bbr->rc_inp = tp->t_inpcb; } /* * We need to init rc_inp here since it's not init'd when * bbr_init is called */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with the * next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if (to.to_flags & TOF_FASTOPEN) { uint16_t mss; if (to.to_flags & TOF_MSS) mss = to.to_mss; else if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) mss = TCP6_MSS; else mss = TCP_MSS; tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else tcp_fastopen_disable_path(tp); } } /* * At this point we are at the initial call. Here we decide * if we are doing RACK or not. We do this by seeing if * TF_SACK_PERMIT is set, if not rack is *not* possible and * we switch to the default code. */ if ((tp->t_flags & TF_SACK_PERMIT) == 0) { /* Bail */ tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, tlen, iptos); return (1); } /* Set the flag */ bbr->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack); } if (thflags & TH_ACK) { /* Track ack types */ if (to.to_flags & TOF_SACK) BBR_STAT_INC(bbr_acks_with_sacks); else BBR_STAT_INC(bbr_plain_acks); } /* * This is the one exception case where we set the rack state * always. All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK).
*/ if (bbr->r_state != tp->t_state) bbr_set_state(tp, bbr, tiwin); if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = bbr->r_state; bbr->rc_ack_was_delayed = 0; lost = bbr->r_ctl.rc_lost; bbr->rc_is_pkt_epoch_now = 0; if (m->m_flags & (M_TSTMP|M_TSTMP_LRO)) { /* Get the real time into lcts and figure the real delay */ lcts = tcp_get_usecs(&ltv); if (TSTMP_GT(lcts, cts)) { bbr->r_ctl.rc_ack_hdwr_delay = lcts - cts; bbr->rc_ack_was_delayed = 1; if (TSTMP_GT(bbr->r_ctl.rc_ack_hdwr_delay, bbr->r_ctl.highest_hdwr_delay)) bbr->r_ctl.highest_hdwr_delay = bbr->r_ctl.rc_ack_hdwr_delay; } else { bbr->r_ctl.rc_ack_hdwr_delay = 0; bbr->rc_ack_was_delayed = 0; } } else { bbr->r_ctl.rc_ack_hdwr_delay = 0; bbr->rc_ack_was_delayed = 0; } bbr_log_ack_event(bbr, th, &to, tlen, nsegs, cts, nxt_pkt, m); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { retval = 0; m_freem(m); goto done_with_input; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state, * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } in_recovery = IN_RECOVERY(tp->t_flags); if (tiwin > bbr->r_ctl.rc_high_rwnd) bbr->r_ctl.rc_high_rwnd = tiwin; #ifdef BBR_INVARIANTS if ((tp->t_inpcb->inp_flags & INP_DROPPED) || (tp->t_inpcb->inp_flags2 & INP_FREED)) { panic("tp:%p bbr:%p given a dropped inp:%p", tp, bbr, tp->t_inpcb); } #endif bbr->r_ctl.rc_flight_at_input = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr->rtt_valid = 0; if (to.to_flags & TOF_TS) { bbr->rc_ts_valid = 1; bbr->r_ctl.last_inbound_ts = to.to_tsval; } else { bbr->rc_ts_valid = 0; bbr->r_ctl.last_inbound_ts = 0; } retval = (*bbr->r_substate) (m, th, so, tp, &to, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt); #ifdef BBR_INVARIANTS if ((retval == 0) && (tp->t_inpcb == NULL)) { panic("retval:%d tp:%p t_inpcb:NULL state:%d", retval, tp, prev_state); } #endif if (nxt_pkt == 0) BBR_STAT_INC(bbr_rlock_left_ret0); else BBR_STAT_INC(bbr_rlock_left_ret1); if (retval == 0) { /* * If retval is 1 the tcb is unlocked and most likely the tp * is gone. */ INP_WLOCK_ASSERT(tp->t_inpcb); tcp_bbr_xmit_timer_commit(bbr, tp, cts); if (bbr->rc_is_pkt_epoch_now) bbr_set_pktepoch(bbr, cts, __LINE__); bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost)); if (nxt_pkt == 0) { if (bbr->r_wanted_output != 0) { bbr->rc_output_starts_timer = 0; did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } else bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0); } if ((nxt_pkt == 0) && ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && (SEQ_GT(tp->snd_max, tp->snd_una) || (tp->t_flags & TF_DELACK) || ((V_tcp_always_keepalive || bbr->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)))) { /* * We could not send (probably in the hpts but * stopped the timer)?
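* If so, the code below measures how much of the programmed pacer delay already elapsed, del = lcts - rc_pacer_started. Illustrative (assumed) numbers: a pacer armed for rc_last_delay_val = 10000 usec with del = 4000 usec is re-armed for the remaining 6000 usec, while one past its deadline clears the delay and forces an immediate output.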
*/ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && (bbr->rc_inp->inp_in_hpts) && (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* * keep-alive not needed if we are already * scheduled for hptsi output */ ; } else { if (bbr->rc_inp->inp_in_hpts) { tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && (TSTMP_GT(lcts, bbr->rc_pacer_started))) { uint32_t del; del = lcts - bbr->rc_pacer_started; if (bbr->r_ctl.rc_last_delay_val > del) { BBR_STAT_INC(bbr_force_timer_start); bbr->r_ctl.rc_last_delay_val -= del; bbr->rc_pacer_started = lcts; } else { /* We are late */ bbr->r_ctl.rc_last_delay_val = 0; BBR_STAT_INC(bbr_force_output); (void)tp->t_fb->tfb_tcp_output(tp); } } } bbr_start_hpts_timer(bbr, tp, cts, 8, bbr->r_ctl.rc_last_delay_val, 0); } } else if ((bbr->rc_output_starts_timer == 0) && (nxt_pkt == 0)) { /* Do we have the correct timer running? */ bbr_timer_audit(tp, bbr, lcts, &so->so_snd); } /* Do we have a new state */ if (bbr->r_state != tp->t_state) bbr_set_state(tp, bbr, tiwin); done_with_input: bbr_log_doseg_done(bbr, cts, nxt_pkt, did_out); if (did_out) bbr->r_wanted_output = 0; #ifdef BBR_INVARIANTS if (tp->t_inpcb == NULL) { panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", did_out, retval, tp, prev_state); } #endif } return (retval); } static void bbr_log_type_hrdwtso(struct tcpcb *tp, struct tcp_bbr *bbr, int len, int mod, int what_we_can_send) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; uint32_t cts; cts = tcp_get_usecs(&tv); bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->r_ctl.rc_pace_min_segs; log.u_bbr.flex2 = what_we_can_send; log.u_bbr.flex3 = bbr->r_ctl.rc_pace_max_segs; log.u_bbr.flex4 = len; log.u_bbr.flex5 = 0; log.u_bbr.flex7 = mod; log.u_bbr.flex8 = 1; TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, TCP_HDWR_TLS, 0, 0, &log, false, &tv); } } static void bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { struct timeval tv; int retval; /* First let's see if we have old packets */ if (tp->t_in_pkt) { if (ctf_do_queued_segments(so, tp, 1)) { m_freem(m); return; } } if (m->m_flags & M_TSTMP_LRO) { tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000; } else { /* Should not happen; should we KASSERT instead? */ tcp_get_usecs(&tv); } retval = bbr_do_segment_nounlock(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv); if (retval == 0) INP_WUNLOCK(tp->t_inpcb); } /* * Return how much data can be sent without violating the * cwnd or rwnd. */ static inline uint32_t bbr_what_can_we_send(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t sendwin, uint32_t avail, int32_t sb_offset, uint32_t cts) { uint32_t len; if (ctf_outstanding(tp) >= tp->snd_wnd) { /* We never want to go over our peer's rcv-window */ len = 0; } else { uint32_t flight; flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if (flight >= sendwin) { /* * We have in flight what we are allowed by cwnd (if * it was rwnd blocking it would have hit our * >= tp->snd_wnd check above). */ return (0); } len = sendwin - flight; if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { /* We would send too much (beyond the rwnd) */ len = tp->snd_wnd - ctf_outstanding(tp); } if ((len + sb_offset) > avail) { /* * We don't have that much in the SB, how much is * there?
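* (Worked example, numbers assumed: sendwin = 50000 and flight = 30000 give len = 20000; if len + ctf_outstanding(tp) would exceed snd_wnd, len is clipped to snd_wnd - ctf_outstanding(tp), and here it is finally clipped to avail - sb_offset.)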
*/ len = avail - sb_offset; } } return (len); } static inline void bbr_do_error_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error) { #ifdef NETFLIX_STATS TCPSTAT_INC(tcps_sndpack_error); TCPSTAT_ADD(tcps_sndbyte_error, len); #endif } static inline void bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error) { if (error) { bbr_do_error_accounting(tp, bbr, rsm, len, error); return; } if ((tp->t_flags & TF_FORCEDATA) && len == 1) { /* Window probe */ TCPSTAT_INC(tcps_sndprobe); #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif } else if (rsm) { if (rsm->r_flags & BBR_TLP) { /* * TLP should not count in the retransmit count, but in its * own bin */ #ifdef NETFLIX_STATS tp->t_sndtlppack++; tp->t_sndtlpbyte += len; TCPSTAT_INC(tcps_tlpresends); TCPSTAT_ADD(tcps_tlpresend_bytes, len); #endif } else { /* Retransmit */ tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif } /* * Logs in 0 - 8; 8 is all non probe_bw states, 0-7 is the * sub-state */ counter_u64_add(bbr_state_lost[rsm->r_bbr_state], len); if (bbr->rc_bbr_state != BBR_STATE_PROBE_BW) { /* Non probe_bw log in 1, 2, or 4. */ counter_u64_add(bbr_state_resend[bbr->rc_bbr_state], len); } else { /* * Log our probe state 3, and log also 5-13 to show * us the recovery sub-state for the send. This * means that 3 == (5+6+7+8+9+10+11+12+13) */ counter_u64_add(bbr_state_resend[BBR_STATE_PROBE_BW], len); counter_u64_add(bbr_state_resend[(bbr_state_val(bbr) + 5)], len); } /* Place in both 16's the totals of retransmitted */ counter_u64_add(bbr_state_lost[16], len); counter_u64_add(bbr_state_resend[16], len); /* Place in 17's the total sent */ counter_u64_add(bbr_state_resend[17], len); counter_u64_add(bbr_state_lost[17], len); } else { /* New sends */ TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); /* Place in 17's the total sent */ counter_u64_add(bbr_state_resend[17], len); counter_u64_add(bbr_state_lost[17], len); #ifdef STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } } static void bbr_cwnd_limiting(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t in_level) { if (bbr->rc_filled_pipe && bbr_target_cwnd_mult_limit && (bbr->rc_use_google == 0)) { /* * Limit the cwnd to not be above N x the target plus what * is outstanding. The target is based on the current b/w * estimate. */ uint32_t target; target = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), BBR_UNIT); target += ctf_outstanding(tp); target *= bbr_target_cwnd_mult_limit; if (tp->snd_cwnd > target) tp->snd_cwnd = target; bbr_log_type_cwndupd(bbr, 0, 0, 0, 10, 0, 0, __LINE__); } } static int bbr_window_update_needed(struct tcpcb *tp, struct socket *so, uint32_t recwin, int32_t maxseg) { /* * "adv" is the amount we could increase the window, taking into * account that we are limited by TCP_MAXWIN << tp->rcv_scale. */ uint32_t adv; int32_t oldwin; adv = min(recwin, TCP_MAXWIN << tp->rcv_scale); if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as the old size * when it is scaled, then don't force a window update.
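* (For example, with rcv_scale = 7 the advertised window field moves in 128-byte steps, so an increase that leaves (adv + oldwin) >> rcv_scale equal to oldwin >> rcv_scale is invisible to the peer and is suppressed below.)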
*/ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) return (0); if (adv >= (2 * maxseg) && (adv >= (so->so_rcv.sb_hiwat / 4) || recwin <= (so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * maxseg)) { return (1); } if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) return (1); return (0); } /* * Return 0 on success and an errno on failure to send. * Note that a 0 return may not mean we sent anything * if the TCB was on the hpts. A non-zero return * does indicate the error we got from ip[6]_output. */ static int bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) { struct socket *so; int32_t len; uint32_t cts; uint32_t recwin, sendwin; int32_t sb_offset; int32_t flags, abandon, error = 0; struct tcp_log_buffer *lgb = NULL; struct mbuf *m; struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize = 0; uint32_t if_hw_tsomax = 0; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif struct tcp_bbr *bbr; struct tcphdr *th; #ifdef NETFLIX_TCPOUDP struct udphdr *udp = NULL; #endif u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef NETFLIX_TCPOUDP unsigned ulen; #endif uint32_t bbr_seq; uint32_t delay_calc=0; uint8_t doing_tlp = 0; uint8_t local_options; #ifdef BBR_INVARIANTS uint8_t doing_retran_from = 0; uint8_t picked_up_retran = 0; #endif uint8_t wanted_cookie = 0; uint8_t more_to_rxt=0; int32_t prefetch_so_done = 0; int32_t prefetch_rsm = 0; uint32_t what_we_can = 0; uint32_t tot_len = 0; uint32_t rtr_cnt = 0; uint32_t maxseg, pace_max_segs, p_maxseg; int32_t csum_flags; int32_t hw_tls; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif volatile int32_t sack_rxmit; struct bbr_sendmap *rsm = NULL; int32_t tso, mtu; int force_tso = 0; struct tcpopt to; int32_t slot = 0; struct inpcb *inp; struct sockbuf *sb; uint32_t hpts_calling; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif uint8_t app_limited = BBR_JR_SENT_DATA; uint8_t filled_all = 0; bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* We take a cache hit here */ memcpy(&bbr->rc_tv, tv, sizeof(struct timeval)); cts = tcp_tv_to_usectick(&bbr->rc_tv); inp = bbr->rc_inp; so = inp->inp_socket; sb = &so->so_snd; #ifdef KERN_TLS if (sb->sb_flags & SB_TLS_IFNET) hw_tls = 1; else #endif hw_tls = 0; kern_prefetch(sb, &maxseg); maxseg = tp->t_maxseg - bbr->rc_last_options; if (bbr_minseg(bbr) < maxseg) { tcp_bbr_tso_size_check(bbr, cts); } /* Remove any flags that indicate we are pacing on the inp */ pace_max_segs = bbr->r_ctl.rc_pace_max_segs; p_maxseg = min(maxseg, pace_max_segs); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif #ifdef INET6 if (bbr->r_state) { /* Use the cache line loaded if possible */ isipv6 = bbr->r_is_v6; } else { isipv6 = (inp->inp_vflag & INP_IPV6) != 0; } #endif if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && inp->inp_in_hpts) { /* * We are on the hpts for some timer but not hptsi output. * Possibly remove from the hpts so we can send/recv etc. */ if ((tp->t_flags & TF_ACKNOW) == 0) { /* * No immediate demand right now to send an ack, but * the user may have read, making room for new data * (a window update). If so we may want to cancel * whatever timer is running (KEEP/DEL-ACK?) and * continue to send out a window update. Or we may * have gotten more data into the socket buffer to * send.
*/ recwin = min(max(sbspace(&so->so_rcv), 0), TCP_MAXWIN << tp->rcv_scale); if ((bbr_window_update_needed(tp, so, recwin, maxseg) == 0) && ((sbavail(sb) + ((tcp_outflags[tp->t_state] & TH_FIN) ? 1 : 0)) <= (tp->snd_max - tp->snd_una))) { /* * Nothing new to send and no window update * is needed to send. Let's just return and * let the timer run off. */ return (0); } } tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); bbr_timer_cancel(bbr, __LINE__, cts); } if (bbr->r_ctl.rc_last_delay_val) { /* Calculate a rough delay for early escape to sending */ if (SEQ_GT(cts, bbr->rc_pacer_started)) delay_calc = cts - bbr->rc_pacer_started; if (delay_calc >= bbr->r_ctl.rc_last_delay_val) delay_calc -= bbr->r_ctl.rc_last_delay_val; else delay_calc = 0; } /* Mark that we have called bbr_output(). */ if ((bbr->r_timer_override) || (tp->t_flags & TF_FORCEDATA) || (tp->t_state < TCPS_ESTABLISHED)) { /* Timeouts or early states are exempt */ if (inp->inp_in_hpts) tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); } else if (inp->inp_in_hpts) { if ((bbr->r_ctl.rc_last_delay_val) && (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && delay_calc) { /* * We were being paced for output and the delay has * already exceeded the time we were supposed to be * called; let's go ahead and pull out of the hpts * and call output. */ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1); bbr->r_ctl.rc_last_delay_val = 0; tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); } else if (tp->t_state == TCPS_CLOSED) { bbr->r_ctl.rc_last_delay_val = 0; tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); } else { /* * On the hpts, you shall not pass! Even if ACKNOW * is on, we will send when the hpts fires, unless of * course we are overdue. */ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_INPACE], 1); return (0); } } bbr->rc_cwnd_limited = 0; if (bbr->r_ctl.rc_last_delay_val) { /* recalculate the real delay and deal with over/under */ if (SEQ_GT(cts, bbr->rc_pacer_started)) delay_calc = cts - bbr->rc_pacer_started; else delay_calc = 0; if (delay_calc >= bbr->r_ctl.rc_last_delay_val) /* Setup the delay which will be added in */ delay_calc -= bbr->r_ctl.rc_last_delay_val; else { /* * We are early; set up to adjust * our slot time. */ uint64_t merged_val; bbr->r_ctl.rc_agg_early += (bbr->r_ctl.rc_last_delay_val - delay_calc); bbr->r_agg_early_set = 1; if (bbr->r_ctl.rc_hptsi_agg_delay) { if (bbr->r_ctl.rc_hptsi_agg_delay >= bbr->r_ctl.rc_agg_early) { /* Nope, our previous late cancels out the early */ bbr->r_ctl.rc_hptsi_agg_delay -= bbr->r_ctl.rc_agg_early; bbr->r_agg_early_set = 0; bbr->r_ctl.rc_agg_early = 0; } else { bbr->r_ctl.rc_agg_early -= bbr->r_ctl.rc_hptsi_agg_delay; bbr->r_ctl.rc_hptsi_agg_delay = 0; } } merged_val = bbr->rc_pacer_started; merged_val <<= 32; merged_val |= bbr->r_ctl.rc_last_delay_val; bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls, bbr->r_ctl.rc_agg_early, cts, delay_calc, merged_val, bbr->r_agg_early_set, 3); bbr->r_ctl.rc_last_delay_val = 0; BBR_STAT_INC(bbr_early); delay_calc = 0; } } else { /* We were not delayed due to hptsi */ if (bbr->r_agg_early_set) bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; delay_calc = 0; } if (delay_calc) { /* * We had a hptsi delay which means we are falling behind on * sending at the expected rate. Calculate an extra amount * of data we can send, if any, to put us back on track.
*/ if ((bbr->r_ctl.rc_hptsi_agg_delay + delay_calc) < bbr->r_ctl.rc_hptsi_agg_delay) bbr->r_ctl.rc_hptsi_agg_delay = 0xffffffff; else bbr->r_ctl.rc_hptsi_agg_delay += delay_calc; } sendwin = min(tp->snd_wnd, tp->snd_cwnd); if ((tp->snd_una == tp->snd_max) && (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) && (sbavail(sb))) { /* * Ok we have been idle with nothing outstanding; we possibly * need to start fresh with either a new * suite of states or a fast-ramp up. */ bbr_restart_after_idle(bbr, cts, bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time)); } /* * Now was there a hptsi delay where we are behind? We only count * being behind if: a) We are not in recovery. b) There was a delay. * c) We had room to send something. */ hpts_calling = inp->inp_hpts_calls; inp->inp_hpts_calls = 0; if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (bbr_process_timers(tp, bbr, cts, hpts_calling)) { counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1); return (0); } } bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; if (hpts_calling && (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { bbr->r_ctl.rc_last_delay_val = 0; } bbr->r_timer_override = 0; bbr->r_wanted_output = 0; /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && ((tp->t_state == TCPS_SYN_RECEIVED) || (tp->t_state == TCPS_SYN_SENT)) && - SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN or SYN|ACK sent */ + SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ (tp->t_rxtshift == 0)) { /* not a retransmit */ return (0); } /* * Before sending anything, check for a state update. For hpts * calling without input this is important. If it's input calling, * then this was already done. */ if (bbr->rc_use_google == 0) bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. BBR in general does not pay much attention to snd_nxt; * for historic reasons the persist timer still uses it. This means * we have to look at it. All retransmissions that are not persists * use the rsm that needs to be sent, so snd_nxt is ignored. At the * end of this routine we pull snd_nxt always up to snd_max. */ doing_tlp = 0; #ifdef BBR_INVARIANTS doing_retran_from = picked_up_retran = 0; #endif error = 0; tso = 0; slot = 0; mtu = 0; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sb_offset = tp->snd_max - tp->snd_una; flags = tcp_outflags[tp->t_state]; sack_rxmit = 0; len = 0; rsm = NULL; if (flags & TH_RST) { SOCKBUF_LOCK(sb); goto send; } recheck_resend: while (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) { /* We need to always have one in reserve */ rsm = bbr_alloc(bbr); if (rsm == NULL) { error = ENOMEM; /* Lie to get on the hpts */ tot_len = tp->t_maxseg; if (hpts_calling) /* Retry in a ms */ slot = 1001; goto just_return_nolock; } TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); bbr->r_ctl.rc_free_cnt++; rsm = NULL; } /* What do we send, a resend?
*/ if (bbr->r_ctl.rc_resend == NULL) { /* Check for rack timeout */ bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); if (bbr->r_ctl.rc_resend) { #ifdef BBR_INVARIANTS picked_up_retran = 1; #endif bbr_cong_signal(tp, NULL, CC_NDUPACK, bbr->r_ctl.rc_resend); } } if (bbr->r_ctl.rc_resend) { rsm = bbr->r_ctl.rc_resend; #ifdef BBR_INVARIANTS doing_retran_from = 1; #endif /* Remove any TLP flags; it's a RACK or T-O */ rsm->r_flags &= ~BBR_TLP; bbr->r_ctl.rc_resend = NULL; if (SEQ_LT(rsm->r_start, tp->snd_una)) { #ifdef BBR_INVARIANTS panic("Huh, tp:%p bbr:%p rsm:%p start:%u < snd_una:%u\n", tp, bbr, rsm, rsm->r_start, tp->snd_una); goto recheck_resend; #else /* TSNH */ rsm = NULL; goto recheck_resend; #endif } rtr_cnt++; if (rsm->r_flags & BBR_HAS_SYN) { /* Only retransmit a SYN by itself */ len = 0; if ((flags & TH_SYN) == 0) { /* Huh, something is wrong */ rsm->r_start++; if (rsm->r_start == rsm->r_end) { /* Clean it up, somehow we missed the ack? */ bbr_log_syn(tp, NULL); } else { /* TFO with data? */ rsm->r_flags &= ~BBR_HAS_SYN; len = rsm->r_end - rsm->r_start; } } else { /* Retransmitting SYN */ rsm = NULL; SOCKBUF_LOCK(sb); goto send; } } else len = rsm->r_end - rsm->r_start; if ((bbr->rc_resends_use_tso == 0) && #ifdef KERN_TLS ((sb->sb_flags & SB_TLS_IFNET) == 0) && #endif (len > maxseg)) { len = maxseg; more_to_rxt = 1; } sb_offset = rsm->r_start - tp->snd_una; if (len > 0) { sack_rxmit = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, maxseg)); } else { /* I don't think this can happen */ rsm = NULL; goto recheck_resend; } BBR_STAT_INC(bbr_resends_set); } else if (bbr->r_ctl.rc_tlp_send) { /* * Tail loss probe */ doing_tlp = 1; rsm = bbr->r_ctl.rc_tlp_send; bbr->r_ctl.rc_tlp_send = NULL; sack_rxmit = 1; len = rsm->r_end - rsm->r_start; rtr_cnt++; if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) len = maxseg; if (SEQ_GT(tp->snd_una, rsm->r_start)) { #ifdef BBR_INVARIANTS panic("tp:%p bbc:%p snd_una:%u rsm:%p r_start:%u", tp, bbr, tp->snd_una, rsm, rsm->r_start); #else /* TSNH */ rsm = NULL; goto recheck_resend; #endif } sb_offset = rsm->r_start - tp->snd_una; BBR_STAT_INC(bbr_tlp_set); } /* * Enforce a connection sendmap count limit if set * as long as we are not retransmitting. */ if ((rsm == NULL) && (V_tcp_map_entries_limit > 0) && (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { BBR_STAT_INC(bbr_alloc_limited); if (!bbr->alloc_limit_reported) { bbr->alloc_limit_reported = 1; BBR_STAT_INC(bbr_alloc_limited_conns); } goto just_return_nolock; } #ifdef BBR_INVARIANTS if (rsm && SEQ_LT(rsm->r_start, tp->snd_una)) { panic("tp:%p bbr:%p rsm:%p sb_offset:%u len:%u", tp, bbr, rsm, sb_offset, len); } #endif /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN && (rsm == NULL)) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; if (rsm && (rsm->r_flags & BBR_HAS_FIN)) { /* we are retransmitting the fin */ len--; if (len) { /* * When retransmitting data do *not* include the * FIN. This could happen from a TLP probe if we * allowed data with a FIN. */ flags &= ~TH_FIN; } } else if (rsm) { if (flags & TH_FIN) flags &= ~TH_FIN; } if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { void *end_rsm; end_rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext); if (end_rsm) kern_prefetch(end_rsm, &prefetch_rsm); prefetch_rsm = 1; } SOCKBUF_LOCK(sb); /* * If in persist timeout with window of 0, send 1 byte.
Otherwise, * if window is small but nonzero and the timer has expired, we * will send what we can and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if ((sendwin == 0) || (sendwin <= (tp->snd_max - tp->snd_una))) { /* * If we still have some data to send, then clear * the FIN bit. Usually this would happen below * when it realizes that we aren't sending all the * data. However, if we have exactly 1 byte of * unsent data, then it won't clear the FIN bit * below, and if we are in persist state, we wind up * sending the packet without recording that we sent * the FIN bit. * * We can't just blindly clear the FIN bit, because * if we don't have any more data to send then the * probe will be the FIN itself. */ if (sb_offset < sbused(sb)) flags &= ~TH_FIN; sendwin = 1; } else { if ((bbr->rc_in_persist != 0) && (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr)))) { /* Exit persists if there is space */ bbr_exit_persist(tp, bbr, cts, __LINE__); } if (rsm == NULL) { /* * If we are dropping persist mode then we * need to correct sb_offset if not a * retransmit. */ sb_offset = tp->snd_max - tp->snd_una; } } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a * negative length. This can also occur when TCP opens up its * congestion window while receiving additional duplicate acks after * fast-retransmit because TCP will reset snd_nxt to snd_max after * the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will be * set to snd_una, the sb_offset will be 0, and the length may wind * up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { uint32_t avail; avail = sbavail(sb); if (SEQ_GT(tp->snd_max, tp->snd_una)) sb_offset = tp->snd_max - tp->snd_una; else sb_offset = 0; if (bbr->rc_tlp_new_data) { /* TLP is forcing out new data */ uint32_t tlplen; doing_tlp = 1; tlplen = maxseg; if (tlplen > (uint32_t)(avail - sb_offset)) { tlplen = (uint32_t)(avail - sb_offset); } if (tlplen > tp->snd_wnd) { len = tp->snd_wnd; } else { len = tlplen; } bbr->rc_tlp_new_data = 0; } else { what_we_can = len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts); if ((len < p_maxseg) && (bbr->rc_in_persist == 0) && (ctf_outstanding(tp) >= (2 * p_maxseg)) && ((avail - sb_offset) >= p_maxseg)) { /* * We are not completing what's in the socket * buffer (i.e. there is at least a segment * waiting to send) and we have 2 or more * segments outstanding. There is no sense * in sending a little piece. Let's defer * and wait until we can send a whole * segment. */ len = 0; } if ((tp->t_flags & TF_FORCEDATA) && (bbr->rc_in_persist)) { /* * We are in persists, figure out if * a retransmit is available (maybe the previous * persists we sent) or if we have to send new * data. */ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); if (rsm) { len = rsm->r_end - rsm->r_start; if (rsm->r_flags & BBR_HAS_FIN) len--; if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) len = maxseg; if (len > 1) BBR_STAT_INC(bbr_persist_reneg); /* * XXXrrs we could force the len to * 1 byte here to cause the chunk to * split apart.. but that would then * mean we always retransmit it as * one byte even after the window * opens. */ sack_rxmit = 1; sb_offset = rsm->r_start - tp->snd_una; } else { /* * First time through in persists or peer * acked our one byte. Though we do have * to have something in the sb.
*/ len = 1; sb_offset = 0; if (avail == 0) len = 0; } } } if (prefetch_so_done == 0) { kern_prefetch(so, &prefetch_so_done); prefetch_so_done = 1; } /* * Lop off SYN bit if it has already been sent. However, if this is * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && (rsm == NULL) && SEQ_GT(tp->snd_max, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; sb_offset--, len++; if (sbavail(sb) == 0) len = 0; } else if ((flags & TH_SYN) && rsm) { /* * Subtract one from the len for the SYN being * retransmitted. */ len--; } /* * Be careful not to send data and/or FIN on SYN segments. This * measure is needed to prevent interoperability problems with not * fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * - When retransmitting SYN on an actively created socket * - When sending a zero-length cookie (cookie request) on an * actively created socket * - When the socket is in the CLOSED state (RST is being sent) */ if (IS_FASTOPEN(tp->t_flags) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) { len = 0; sack_rxmit = 0; rsm = NULL; } /* Without fast-open there should never be data sent on a SYN */ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) len = 0; if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been * called to retransmit, len will be < 0. Otherwise, window * shrank after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back to (closed) * window, and set the persist timer if it isn't already * going. If the window didn't close completely, just wait * for an ACK. * * We also do a general check here to ensure that we will * set the persist timer when we have data to send, but a * 0-byte window. This makes sure the persist timer is set * even if the packet hits one of the "goto send" lines * below. */ len = 0; if ((tp->snd_wnd == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (tp->snd_una == tp->snd_max) && (sb_offset < (int)sbavail(sb))) { /* * Not enough room in the rwnd to send * a paced segment out. */ bbr_enter_persist(tp, bbr, cts, __LINE__); } } else if ((rsm == NULL) && (doing_tlp == 0) && (len < bbr->r_ctl.rc_pace_max_segs)) { /* * We are not sending a full segment for * some reason. Should we not send anything (think * sws or persists)? */ if ((tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (len < (int)(sbavail(sb) - sb_offset))) { /* * Here the rwnd is less than * the pacing size, this is not a retransmit, * we are established, and * the send is not the last in the socket buffer; * let's not send, and possibly enter persists.
*/ len = 0; if (tp->snd_max == tp->snd_una) bbr_enter_persist(tp, bbr, cts, __LINE__); } else if ((tp->snd_cwnd >= bbr->r_ctl.rc_pace_max_segs) && (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) && (len < (int)(sbavail(sb) - sb_offset)) && (len < bbr_minseg(bbr))) { /* * Here we are not retransmitting, and * the cwnd is not so small that we could * not send at least a min size (rxt timer * not having gone off), we have 2 segments or * more already in flight, it's not the tail end * of the socket buffer and the cwnd is blocking * us from sending out minimum pacing segment size. * Let's not send anything. */ bbr->rc_cwnd_limited = 1; len = 0; } else if (((tp->snd_wnd - ctf_outstanding(tp)) < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) && (len < (int)(sbavail(sb) - sb_offset)) && (TCPS_HAVEESTABLISHED(tp->t_state))) { /* * Here we have a send window but we have * filled it up and we can't send another pacing segment. * We also have in flight more than 2 segments * and we are not completing the sb, i.e. we allow * the last bytes of the sb to go out even if * it's not a full pacing segment. */ len = 0; } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); if (bbr->rc_in_persist && len && (rsm == NULL) && (len < min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs))) { /* * We are in persist, not doing a retransmit and don't have enough space * yet to send a full TSO. So: is it at the end of the sb? * If so we need to send; else nuke len to 0 and don't send. */ int sbleft; if (sbavail(sb) > sb_offset) sbleft = sbavail(sb) - sb_offset; else sbleft = 0; if (sbleft >= min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs)) { /* not at end of sb; let's not send */ len = 0; } } /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP * options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per * generated segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() * does the right thing below to provide length of just ip options * and thus checking for ipoptlen is enough to decide if ip options * are present. */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(inp); else #endif if (inp->inp_options) ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok.
*/ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); #endif /* INET */ #endif /* IPSEC */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && (len > maxseg) && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && ipoptlen == 0) tso = 1; recwin = min(max(sbspace(&so->so_rcv), 0), TCP_MAXWIN << tp->rcv_scale); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) - This is the last * buffer in a write()/send() and we are either idle or running * NODELAY - we've timed out (e.g. persist timer) - we have more * than 1/2 the maximum send window's worth of data (receiver may be * limiting the window size) - we need to retransmit */ if (rsm) goto send; if (len) { if (sack_rxmit) goto send; if (len >= p_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause us * to flush a buffer queued with moretocome. XXX * */ if (((tp->t_flags & TF_MORETOCOME) == 0) && /* normal case */ ((tp->t_flags & TF_NODELAY) || ((uint32_t)len + (uint32_t)sb_offset) >= sbavail(&so->so_snd)) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ goto send; } if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ goto send; } if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { goto send; } } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereby every read from * the receive buffer, no matter how small, causes a window update * to be sent. We also should avoid sending a flurry of window * updates when the socket buffer had queued a lot of data and the * application is doing small reads. * * Prevent a flurry of pointless window updates by only sending an * update when we can increase the advertised window by more than * 1/4th of the socket buffer capacity. When the buffer is getting * full or is very small be more aggressive and send an update * whenever we can increase by two mss sized segments. In all other * situations the ACKs to new incoming data will carry further * window increases. * * Don't send an independent window update if a delayed ACK is * pending (it will get piggy-backed on it) or the remote side * already has done a half-close and won't send more data. Skip * this if the connection is in T/TCP half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* Check to see if we should do a window update */ if (bbr_window_update_needed(tp, so, recwin, maxseg)) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case.
*/ if (tp->t_flags & TF_ACKNOW) { goto send; } if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { goto send; } if (SEQ_GT(tp->snd_up, tp->snd_una)) { goto send; } /* * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0)) { goto send; } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(sb); just_return_nolock: if (tot_len) slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) slot = 0; if (tot_len == 0) { if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >= tp->snd_wnd) { BBR_STAT_INC(bbr_rwnd_limited); app_limited = BBR_JR_RWND_LIMITED; bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); if ((bbr->rc_in_persist == 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_max == tp->snd_una) && sbavail(&tp->t_inpcb->inp_socket->so_snd)) { /* No send window; we must enter persist */ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); } } else if (ctf_outstanding(tp) >= sbavail(sb)) { BBR_STAT_INC(bbr_app_limited); app_limited = BBR_JR_APP_LIMITED; bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + p_maxseg) >= tp->snd_cwnd) { BBR_STAT_INC(bbr_cwnd_limited); app_limited = BBR_JR_CWND_LIMITED; bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes))); bbr->rc_cwnd_limited = 1; } else { BBR_STAT_INC(bbr_app_limited); app_limited = BBR_JR_APP_LIMITED; bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); } bbr->r_ctl.rc_hptsi_agg_delay = 0; bbr->r_agg_early_set = 0; bbr->r_ctl.rc_agg_early = 0; bbr->r_ctl.rc_last_delay_val = 0; } else if (bbr->rc_use_google == 0) bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); /* Are we app limited? */ if ((app_limited == BBR_JR_APP_LIMITED) || (app_limited == BBR_JR_RWND_LIMITED)) { /** * We are application limited. */ bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered); } if (tot_len == 0) counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1); tp->t_flags &= ~TF_FORCEDATA; /* Don't update the time if we did not send */ bbr->r_ctl.rc_last_delay_val = 0; bbr->rc_output_starts_timer = 1; bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len); bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is dragged up */ tp->snd_nxt = tp->snd_max; } return (error); send: if (doing_tlp == 0) { /* * Data not a TLP, and it's not the rxt firing. If it is the * rxt firing, we want to leave the tlp_in_progress flag on * so we don't send another TLP. It has to be a rack timer * or normal send (response to acked data) to clear the tlp * in progress flag. */ bbr->rc_tlp_in_progress = 0; bbr->rc_tlp_rtx_out = 0; } else { /* * It's a TLP. */ bbr->rc_tlp_in_progress = 1; } bbr_timer_cancel(bbr, __LINE__, cts); if (rsm == NULL) { if (sbused(sb) > 0) { /* * This is sub-optimal. We only send a standalone * FIN on its own segment. */ if (flags & TH_FIN) { flags &= ~TH_FIN; if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) { /* Let's not send this */ slot = 0; goto just_return; } } } } else { /* * We do *not* send a FIN on a retransmit if it has data. * The if clause here where len > 1 should never come true.
*/ if ((len > 0) && (((rsm->r_flags & BBR_HAS_FIN) == 0) && (flags & TH_FIN))) { flags &= ~TH_FIN; len--; } } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { if ((tp->snd_una == tp->snd_max) && (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { /* * This qualifies as a RTT_PROBE session since we * drop the data outstanding to nothing and waited * more than bbr_rtt_probe_time. */ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } if (len >= maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options unless TCP * set not to do any options. NOTE: we assume that the IP/TCP header * plus TCP options always fit in a single mbuf, leaving room for a * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) * + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else #endif hdrlen = sizeof(struct tcpiphdr); /* * Compute options for segment. We only have to care about SYN and * established connection segments. Options for SYN-ACK segments * are handled in TCP syncache. */ to.to_flags = 0; local_options = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { to.to_mss = tcp_mssopt(&inp->inp_inc); #ifdef NETFLIX_TCPOUDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } } } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; local_options += TCPOLEN_TIMESTAMP + 2; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv); /* Selective ACK's. */ if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += (optlen = tcp_addoptions(&to, opt)); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } #ifdef NETFLIX_TCPOUDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? 
*/ SOCKBUF_UNLOCK(&so->so_snd); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); } #endif #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if (bbr->rc_last_options != local_options) { /* * Cache the options length this generally does not change * on a connection. We use this to calculate TSO. */ bbr->rc_last_options = local_options; } maxseg = tp->t_maxseg - (ipoptlen + optlen); p_maxseg = min(maxseg, pace_max_segs); /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. Clear the FIN bit because we * cut off the tail of the segment. */ #ifdef KERN_TLS /* force TSO for so TLS offload can get mss */ if (sb->sb_flags & SB_TLS_IFNET) { force_tso = 1; } #endif if (len > maxseg) { if (len != 0 && (flags & TH_FIN)) { flags &= ~TH_FIN; } if (tso) { uint32_t moff; int32_t max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { len = max_len; } } /* * Prevent the last segment from being fractional * unless the send sockbuf can be emptied: */ if (((sb_offset + len) < sbavail(sb)) && (hw_tls == 0)) { moff = len % (uint32_t)maxseg; if (moff != 0) { len -= moff; } } /* * In case there are too many small fragments don't * use TSO: */ if (len <= maxseg) { len = maxseg; tso = 0; } } else { /* Not doing TSO */ if (optlen + ipoptlen >= tp->t_maxseg) { /* * Since we don't have enough space to put * the IP header chain and the TCP header in * one packet as required by RFC 7112, don't * send it. Also ensure that at least one * byte of the payload can be put into the * TCP segment. */ SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; sack_rxmit = 0; goto out; } len = maxseg; } } else { /* Not doing TSO */ if_hw_tsomaxsegcount = 0; tso = 0; } KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); #ifdef DIAGNOSTIC #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); #endif /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further * down. */ #ifdef BBR_INVARIANTS if (sack_rxmit) { if (SEQ_LT(rsm->r_start, tp->snd_una)) { panic("RSM:%p TP:%p bbr:%p start:%u is < snd_una:%u", rsm, tp, bbr, rsm->r_start, tp->snd_una); } } #endif KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); if ((len == 0) && (flags & TH_FIN) && (sbused(sb))) { /* * We have outstanding data, don't send a fin by itself!. */ slot = 0; goto just_return; } /* * Grab a header mbuf, attaching a copy of data to be transmitted, * and initialize the header from the template for sends on this * connection. */ if (len) { uint32_t moff; uint32_t orig_len; /* * We place a limit on sending with hptsi. 
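 *
 * The TSO trimming a few lines up reduces to a small pure function.
 * A hedged sketch (parameter names invented; the real path also
 * honors force_tso and hw_tls, and maxseg is assumed non-zero):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	clamp_tso_len(uint32_t len, uint32_t maxseg, uint32_t hdrlen,
 *	    uint32_t linkhdr, uint32_t if_hw_tsomax, int emptying_sb)
 *	{
 *		if (if_hw_tsomax != 0) {
 *			int32_t max_len = (int32_t)if_hw_tsomax -
 *			    (int32_t)hdrlen - (int32_t)linkhdr;
 *
 *			if (max_len <= 0)
 *				return (0);	// headers exceed the limit
 *			if (len > (uint32_t)max_len)
 *				len = (uint32_t)max_len;
 *		}
 *		// Keep only whole segments unless this empties the sockbuf.
 *		if (!emptying_sb)
 *			len -= len % maxseg;
 *		return (len);
 *	}
 *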
*/ if ((rsm == NULL) && len > pace_max_segs) len = pace_max_segs; if (len <= maxseg) tso = 0; #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { BBR_STAT_INC(bbr_failed_mbuf_aloc); bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); SOCKBUF_UNLOCK(sb); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf to the * sb_offset in the socket buffer chain. */ if ((sb_offset > sbavail(sb)) || ((len + sb_offset) > sbavail(sb))) { #ifdef BBR_INVARIANTS if ((len + sb_offset) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) panic("tp:%p bbr:%p len:%u sb_offset:%u sbavail:%u rsm:%p %u:%u:%u", tp, bbr, len, sb_offset, sbavail(sb), rsm, doing_retran_from, picked_up_retran, doing_tlp); #endif /* * In this messed up situation we have two choices, * a) pretend the send worked, and just start timers * and what not (not good since that may lead us * back here a lot), or b) send the lowest segment * in the map, or c) drop the connection. Let's do (b), * which if it continues to happen will lead to (c) * via timeouts. */ BBR_STAT_INC(bbr_offset_recovery); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); sb_offset = 0; if (rsm == NULL) { sack_rxmit = 0; len = sbavail(sb); } else { sack_rxmit = 1; if (rsm->r_start != tp->snd_una) { /* * Things are really messed up, * (c) is the only thing to do. */ BBR_STAT_INC(bbr_offset_drop); tcp_set_inp_to_drop(inp, EFAULT); return (0); } len = rsm->r_end - rsm->r_start; } if (len > sbavail(sb)) len = sbavail(sb); if (len > maxseg) len = maxseg; } mb = sbsndptr_noadv(sb, sb_offset, &moff); if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); if (rsm == NULL) sbsndptr_adv(sb, mb, len); m->m_len += len; } else { struct sockbuf *msb; if (rsm) msb = NULL; else msb = sb; #ifdef BBR_INVARIANTS if ((len + moff) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) { if (rsm) { panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u rsm:%p snd_una:%u rsm_start:%u flg:%x %u:%u:%u sr:%d ", tp, bbr, len, moff, sbavail(sb), rsm, tp->snd_una, rsm->r_start, rsm->r_flags, doing_retran_from, picked_up_retran, doing_tlp, sack_rxmit); } else { panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u sb_offset:%u snd_una:%u", tp, bbr, len, moff, sbavail(sb), sb_offset, tp->snd_una); } } #endif orig_len = len; m->m_next = tcp_m_copym( #ifdef NETFLIX_COPY_ARGS tp, #endif mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, ((rsm == NULL) ? hw_tls : 0) #ifdef NETFLIX_COPY_ARGS , &filled_all #endif ); if (len <= maxseg && !force_tso) { /* * Must have run out of mbufs for the copy; * shorten it so we no longer need TSO. Let's * not put on sendalot since we are low on * mbufs. */ tso = 0; } if (m->m_next == NULL) { SOCKBUF_UNLOCK(sb); (void)m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } #ifdef BBR_INVARIANTS if (tso && len < maxseg) { panic("tp:%p tso on, but len:%d < maxseg:%d", tp, len, maxseg); } if (tso && if_hw_tsomaxsegcount) { int32_t seg_cnt = 0; struct mbuf *foo; foo = m; while (foo) { seg_cnt++; foo = foo->m_next; } if (seg_cnt > if_hw_tsomaxsegcount) { panic("seg_cnt:%d > max:%d", seg_cnt, if_hw_tsomaxsegcount); } } #endif /* * If we're sending everything we've got, set PUSH. (This * will keep happy those implementations which only give * data to the user when a buffer fills or a PUSH comes in.)
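 *
 * (Concretely: PUSH goes out exactly when sb_offset + len reaches
 * sbused(sb), i.e. this transmission drains everything the
 * application has written so far, there is something to drain, and
 * the segment is not a SYN.)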
*/ if (sb_offset + len == sbused(sb) && sbused(sb) && !(flags & TH_SYN)) { flags |= TH_PUSH; } SOCKBUF_UNLOCK(sb); } else { SOCKBUF_UNLOCK(sb); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { BBR_STAT_INC(bbr_failed_mbuf_aloc); bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); error = ENOBUFS; /* Fudge the send time since we could not send */ sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(sb); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else { #endif th = (struct tcphdr *)(ip6 + 1); #ifdef NETFLIX_TCPOUDP } #endif tcpip_fillheaders(inp, #ifdef NETFLIX_TCPOUDP tp->t_port, #endif ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, #ifdef NETFLIX_TCPOUDP tp->t_port, #endif ip, th); } /* * If we are doing retransmissions, then snd_nxt will not reflect * the first unsent octet. For ACK only packets, we do not want the * sequence number of the retransmitted packet, we want the sequence * number of the next unsent octet. So, if there is no data (and no * SYN or FIN), use snd_max instead of snd_nxt when filling in * ti_seq. But if we are in persist state, snd_max might reflect * one byte beyond the right edge of the window, so use snd_nxt in * that case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len && ((flags & (TH_FIN | TH_SYN | TH_RST)) == 0)) { /* New data (including new persists) */ th->th_seq = htonl(tp->snd_max); bbr_seq = tp->snd_max; } else if (flags & TH_SYN) { /* Syn's always send from iss */ th->th_seq = htonl(tp->iss); bbr_seq = tp->iss; } else if (flags & TH_FIN) { if (flags & TH_FIN && tp->t_flags & TF_SENTFIN) { /* * If we sent the fin already its 1 minus * snd_max */ th->th_seq = (htonl(tp->snd_max - 1)); bbr_seq = (tp->snd_max - 1); } else { /* First time FIN use snd_max */ th->th_seq = htonl(tp->snd_max); bbr_seq = tp->snd_max; } } else if (flags & TH_RST) { /* * For a Reset send the last cum ack in sequence * (this like any other choice may still generate a * challenge ack, if a ack-update packet is in * flight). */ th->th_seq = htonl(tp->snd_una); bbr_seq = tp->snd_una; } else { /* * len == 0 and not persist we use snd_max, sending * an ack unless we have sent the fin then its 1 * minus. 
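 *
 * Condensed, the sequence-number rules spelled out in this block
 * look like the following sketch (non-rsm case only; retransmits
 * below simply use rsm->r_start):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	pick_seq(uint32_t snd_max, uint32_t iss, uint32_t snd_una,
 *	    int has_data, int syn, int fin, int rst, int sent_fin)
 *	{
 *		if (has_data && !syn && !fin && !rst)
 *			return (snd_max);	// new data (and persists)
 *		if (syn)
 *			return (iss);		// SYNs always send from iss
 *		if (rst)
 *			return (snd_una);	// RST: last cumulative ack
 *		if (sent_fin)
 *			return (snd_max - 1);	// FIN already counted once
 *		return (snd_max);
 *	}
 *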
*/ /* * XXXRRS Question: if we are in persists and we have * nothing outstanding to send and we have not sent * a FIN, we will send an ACK. In such a case it * might be better to send (tp->snd_una - 1) which * would force the peer to ack. */ if (tp->t_flags & TF_SENTFIN) { th->th_seq = htonl(tp->snd_max - 1); bbr_seq = (tp->snd_max - 1); } else { th->th_seq = htonl(tp->snd_max); bbr_seq = tp->snd_max; } } else { /* All retransmits use the rsm to guide the send */ th->th_seq = htonl(rsm->r_start); bbr_seq = rsm->r_start; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. */ if ((flags & TH_RST) || ((recwin < (so->so_rcv.sb_hiwat / 4) && recwin < maxseg))) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (tp->rcv_adv - tp->rcv_nxt)) recwin = (tp->rcv_adv - tp->rcv_nxt); if (recwin > TCP_MAXWIN << tp->rcv_scale) recwin = TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a SYN or * SYN,ACK) segment itself is never scaled. The SYN,ACK case is * handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 * window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is * attempting to read more data than can be buffered prior to * transmitting on the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_max)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_max)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull the urgent * pointer to the left edge of the send window so that it * doesn't drift into the send window on sequence number * wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. NOTE: since TCP options buffer doesn't * point into mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ goto out; } } #endif /* * Put TCP length in extended header, and then checksum extended * header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */ #ifdef INET6 if (isipv6) { /* * ip6_plen does not need to be filled in now; it will be filled * in ip6_output.
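 *
 * (A worked instance of the IPv4 pseudo-header seeding below: with
 * len 1448 and optlen 12, in_pseudo() is handed htons(20 + 6 + 1448
 * + 12) = htons(1486), i.e. sizeof(struct tcphdr) + IPPROTO_TCP +
 * payload + options; the checksum-offloading hardware then finishes
 * the sum over the TCP header and data, locating th_sum through
 * csum_data.)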
*/ #ifdef NETFLIX_TCPOUDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { #endif csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); #ifdef NETFLIX_TCPOUDP } #endif } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { #ifdef NETFLIX_TCPOUDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { #endif csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); #ifdef NETFLIX_TCPOUDP } #endif /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. The TCP pseudo * header checksum is always provided. XXX: Fixme: This is currently * not the case for IPv6. */ if (tso || force_tso) { KASSERT(force_tso || len > maxseg, ("%s: len:%d <= tso_segsz:%d", __func__, len, maxseg)); m->m_pkthdr.csum_flags |= CSUM_TSO; csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = maxseg; } KASSERT(len + hdrlen == m_length(m, NULL), ("%s: mbuf chain different than expected: %d + %u != %u", __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TC_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + * (th->th_off << 2) */ ); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ /* Log to the black box */ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); /* Record info on type of transmission */ log.u_bbr.flex1 = bbr->r_ctl.rc_hptsi_agg_delay; log.u_bbr.flex2 = (bbr->r_recovery_bw << 3); log.u_bbr.flex3 = maxseg; log.u_bbr.flex4 = delay_calc; /* Encode filled_all into the upper flex5 bit */ log.u_bbr.flex5 = bbr->rc_past_init_win; log.u_bbr.flex5 <<= 1; log.u_bbr.flex5 |= bbr->rc_no_pacing; log.u_bbr.flex5 <<= 29; if (filled_all) log.u_bbr.flex5 |= 0x80000000; log.u_bbr.flex5 |= tp->t_maxseg; log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs; log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr); /* lets poke in the low and the high here for debugging */ log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg; if (rsm || sack_rxmit) { if (doing_tlp) log.u_bbr.flex8 = 2; else log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, &log, false, NULL, NULL, 0, tv); } else { lgb = NULL; } /* * Fill in IP length and desired time to live and send to IP level. * There should be a better way to handle ttl and tos; we could keep * them in the template, but need a way to checksum without them. 
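 *
 * (With CSUM_TSO set above, the NIC resegments the chain at
 * tso_segsz boundaries; e.g. a 23168-byte payload with tso_segsz =
 * maxseg = 1448 leaves the wire as 16 full-size segments, each
 * checksummed from the seeded pseudo-header value.)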
*/ /* * m->m_pkthdr.len should have been set before cksum calculation, * because in6_cksum() needs it. */ #ifdef INET6 if (isipv6) { /* * We set the hop limit separately for every segment, since the * user might want to change the value via setsockopt. Also, the * desired default hop limit might be changed via Neighbor * Discovery. */ ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace * probes. ip6_output() will set it properly; it's supposed * to include the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, inp->in6p_outputopts, &inp->inp_route6, ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) mtu = inp->inp_route6.ro_rt->rt_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (isipv6) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every * packet. This might not be the best thing to do according * to RFC3390 Section 2. However the tcp hostcache mitigates * the problem so it affects only the first tcp connection * with a host. * * NB: Don't set DF on small MTU/MSS to have a safe * fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; if (tp->t_port == 0 || len < V_tcp_minmss) { ip->ip_off |= htons(IP_DF); } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); error = ip_output(m, inp->inp_options, &inp->inp_route, ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) mtu = inp->inp_route.ro_rt->rt_mtu; } #endif /* INET */ out: if (lgb) { lgb->tlb_errno = error; lgb = NULL; } /* * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max.
*/ if (error == 0) { if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) tcp_clean_dsack_blocks(tp); /* We sent an ack clear the bbr_segs_rcvd count */ bbr->output_error_seen = 0; bbr->oerror_cnt = 0; bbr->bbr_segs_rcvd = 0; if (len == 0) counter_u64_add(bbr_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (hw_tls) { if (filled_all || (len >= bbr->r_ctl.rc_pace_max_segs)) BBR_STAT_INC(bbr_meets_tso_thresh); else { if (doing_tlp) { BBR_STAT_INC(bbr_miss_tlp); bbr_log_type_hrdwtso(tp, bbr, len, 1, what_we_can); } else if (rsm) { BBR_STAT_INC(bbr_miss_retran); bbr_log_type_hrdwtso(tp, bbr, len, 2, what_we_can); } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > sbavail(sb)) { BBR_STAT_INC(bbr_miss_tso_app); bbr_log_type_hrdwtso(tp, bbr, len, 3, what_we_can); } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_cwnd) { BBR_STAT_INC(bbr_miss_tso_cwnd); bbr_log_type_hrdwtso(tp, bbr, len, 4, what_we_can); } else if ((ctf_outstanding(tp) + bbr->r_ctl.rc_pace_max_segs) > tp->snd_wnd) { BBR_STAT_INC(bbr_miss_tso_rwnd); bbr_log_type_hrdwtso(tp, bbr, len, 5, what_we_can); } else { BBR_STAT_INC(bbr_miss_unknown); bbr_log_type_hrdwtso(tp, bbr, len, 6, what_we_can); } } } /* Do accounting for new sends */ if ((len > 0) && (rsm == NULL)) { int idx; if (tp->snd_una == tp->snd_max) { /* * Special case to match google, when * nothing is in flight the delivered * time does get updated to the current * time (see tcp_rate_bsd.c). */ bbr->r_ctl.rc_del_time = cts; } if (len >= maxseg) { idx = (len / maxseg) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(bbr_out_size[(TCP_MSS_ACCT_ATIMER - 1)], 1); else counter_u64_add(bbr_out_size[idx], 1); } else { /* smaller than a MSS */ idx = len / (bbr_hptsi_bytes_min - bbr->rc_last_options); if (idx >= TCP_MSS_SMALL_MAX_SIZE_DIV) idx = (TCP_MSS_SMALL_MAX_SIZE_DIV - 1); counter_u64_add(bbr_out_size[(idx + TCP_MSS_SMALL_SIZE_OFF)], 1); } } } abandon = 0; /* * We must do the send accounting before we log the output, * otherwise the state of the rsm could change and we account to the * wrong bucket. */ if (len > 0) { bbr_do_send_accounting(tp, bbr, rsm, len, error); if (error == 0) { if (tp->snd_una == tp->snd_max) bbr->r_ctl.rc_tlp_rxt_last_time = cts; } } bbr_log_output(bbr, tp, &to, len, bbr_seq, (uint8_t) flags, error, cts, mb, &abandon, rsm, 0, sb); if (abandon) { /* * If bbr_log_output destroys the TCB or sees a TH_RST being * sent we should hit this condition. */ return (0); } if (((tp->t_flags & TF_FORCEDATA) == 0) || (bbr->rc_in_persist == 0)) { /* * Advance snd_nxt over sequence space of this segment. */ if (error) /* We don't log or do anything with errors */ goto skip_upd; if (tp->snd_una == tp->snd_max && (len || (flags & (TH_SYN | TH_FIN)))) { /* * Update the time we just added data since none was * outstanding. 
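 *
 * (Looking back at the bbr_out_size[] accounting above: full-size
 * sends bucket by segment count, idx = (len / maxseg) + 3, clamped
 * at TCP_MSS_ACCT_ATIMER - 1, so e.g. len 5840 with maxseg 1460
 * lands in bucket 7; sub-MSS sends bucket by
 * len / (bbr_hptsi_bytes_min - rc_last_options), offset by
 * TCP_MSS_SMALL_SIZE_OFF.)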
*/ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__); bbr->rc_tp->t_acktime = ticks; } if (flags & (TH_SYN | TH_FIN) && (rsm == NULL)) { if (flags & TH_SYN) { tp->snd_max++; } if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) { tp->snd_max++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit == 0) tp->snd_max += len; skip_upd: if ((error == 0) && len) tot_len += len; } else { /* Persists case */ int32_t xlen = len; if (error) goto nomore; if (flags & TH_SYN) ++xlen; if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (xlen && (tp->snd_una == tp->snd_max)) { /* * Update the time we just added data since none was * outstanding. */ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__); bbr->rc_tp->t_acktime = ticks; } if (sack_rxmit == 0) tp->snd_max += xlen; tot_len += (len + optlen + ipoptlen); } nomore: if (error) { /* * Failures do not advance the seq counter above. For the * case of ENOBUFS we will fall out and become ack-clocked. * capping the cwnd at the current flight. * Everything else will just have to retransmit with the timer * (no pacer). */ SOCKBUF_UNLOCK_ASSERT(sb); BBR_STAT_INC(bbr_saw_oerr); /* Clear all delay/early tracks */ bbr->r_ctl.rc_hptsi_agg_delay = 0; bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; bbr->output_error_seen = 1; if (bbr->oerror_cnt < 0xf) bbr->oerror_cnt++; if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) { /* drop the session */ tcp_set_inp_to_drop(inp, ENETDOWN); } switch (error) { case ENOBUFS: /* * Make this guy have to get ack's to send * more but lets make sure we don't * slam him below a T-O (1MSS). */ if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) - maxseg; if (tp->snd_cwnd < maxseg) tp->snd_cwnd = maxseg; } slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; BBR_STAT_INC(bbr_saw_enobuf); if (bbr->bbr_hdrw_pacing) counter_u64_add(bbr_hdwr_pacing_enobuf, 1); else counter_u64_add(bbr_nohdwr_pacing_enobuf, 1); /* * Here even in the enobuf's case we want to do our * state update. The reason being we may have been * called by the input function. If so we have had * things change. */ error = 0; goto enobufs; case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. If TSO was active we either got an * interface without TSO capabilits or TSO was * turned off. If we obtained mtu from ip_output() * then update it and try again. */ /* Turn on tracing (or try to) */ { int old_maxseg; old_maxseg = tp->t_maxseg; BBR_STAT_INC(bbr_saw_emsgsiz); bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, csum_flags, tso, cts); if (mtu != 0) tcp_mss_update(tp, -1, mtu, NULL, NULL); if (old_maxseg <= tp->t_maxseg) { /* Huh it did not shrink? 
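 * If tcp_mss_update() could not shrink t_maxseg (e.g. no fresh MTU
 * was learned for the route), fall back to shaving 40 bytes, the
 * classic 20 + 20 bytes of IPv4 plus TCP header, off the old value
 * below so the next attempt can still make progress.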
*/ tp->t_maxseg = old_maxseg - 40; bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts); } tp->t_flags &= ~TF_FORCEDATA; /* * Nuke all other things that can interfere * with slot */ if ((tot_len + len) && (len >= tp->t_maxseg)) { slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, (tot_len + len), cts, 0); if (slot < bbr_error_base_paceout) slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; } else slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; bbr_start_hpts_timer(bbr, tp, cts, 10, slot, tot_len); return (error); } case EPERM: tp->t_softerror = error; /* Fall through */ case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; } /* FALLTHROUGH */ default: tp->t_flags &= ~TF_FORCEDATA; slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); return (error); } #ifdef STATS } else if (((tp->t_flags & TF_GPUTINPROG) == 0) && len && (rsm == NULL) && (bbr->rc_in_persist == 0)) { tp->gput_seq = bbr_seq; tp->gput_ack = bbr_seq + min(sbavail(&so->so_snd) - sb_offset, sendwin); tp->gput_ts = cts; tp->t_flags |= TF_GPUTINPROG; #endif } TCPSTAT_INC(tcps_sndtotal); if ((bbr->bbr_hdw_pace_ena) && (bbr->bbr_attempt_hdwr_pace == 0) && (bbr->rc_past_init_win) && (bbr->rc_bbr_state != BBR_STATE_STARTUP) && (get_filter_value(&bbr->r_ctl.rc_delrate)) && (inp->inp_route.ro_rt && inp->inp_route.ro_rt->rt_ifp)) { /* * We are past the initial window and * have at least one measurement so we * could use hardware pacing if its available. * We have an interface and we have not attempted * to setup hardware pacing, lets try to now. */ uint64_t rate_wanted; int err = 0; rate_wanted = bbr_get_hardware_rate(bbr); bbr->bbr_attempt_hdwr_pace = 1; bbr->r_ctl.crte = tcp_set_pacing_rate(bbr->rc_tp, inp->inp_route.ro_rt->rt_ifp, rate_wanted, (RS_PACING_GEQ|RS_PACING_SUB_OK), &err); if (bbr->r_ctl.crte) { bbr_type_log_hdwr_pacing(bbr, bbr->r_ctl.crte->ptbl->rs_ifp, rate_wanted, bbr->r_ctl.crte->rate, __LINE__, cts, err); BBR_STAT_INC(bbr_hdwr_rl_add_ok); counter_u64_add(bbr_flows_nohdwr_pacing, -1); counter_u64_add(bbr_flows_whdwr_pacing, 1); bbr->bbr_hdrw_pacing = 1; /* Now what is our gain status? */ if (bbr->r_ctl.crte->rate < rate_wanted) { /* We have a problem */ bbr_setup_less_of_rate(bbr, cts, bbr->r_ctl.crte->rate, rate_wanted); } else { /* We are good */ bbr->gain_is_limited = 0; bbr->skip_gain = 0; } tcp_bbr_tso_size_check(bbr, cts); } else { bbr_type_log_hdwr_pacing(bbr, inp->inp_route.ro_rt->rt_ifp, rate_wanted, 0, __LINE__, cts, err); BBR_STAT_INC(bbr_hdwr_rl_add_fail); } } if (bbr->bbr_hdrw_pacing) { /* * Worry about cases where the route * changes or something happened that we * lost our hardware pacing possibly during * the last ip_output call. */ if (inp->inp_snd_tag == NULL) { /* A change during ip output disabled hw pacing? */ bbr->bbr_hdrw_pacing = 0; } else if ((inp->inp_route.ro_rt == NULL) || (inp->inp_route.ro_rt->rt_ifp != inp->inp_snd_tag->ifp)) { /* * We had an interface or route change, * detach from the current hdwr pacing * and setup to re-attempt next go * round. */ bbr->bbr_hdrw_pacing = 0; bbr->bbr_attempt_hdwr_pace = 0; tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); tcp_bbr_tso_size_check(bbr, cts); } } /* * Data sent (as far as we can tell). If this advertises a larger * window than any other segment, then remember the size of the * advertised window. 
Any pending ACK has now been sent. */ if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; if ((error == 0) && (bbr->r_ctl.rc_pace_max_segs > tp->t_maxseg) && (doing_tlp == 0) && (tso == 0) && (hw_tls == 0) && (len > 0) && ((flags & TH_RST) == 0) && (IN_RECOVERY(tp->t_flags) == 0) && (bbr->rc_in_persist == 0) && ((tp->t_flags & TF_FORCEDATA) == 0) && (tot_len < bbr->r_ctl.rc_pace_max_segs)) { /* * For non-tso we need to goto again until we have sent out * enough data to match what we are hptsi out every hptsi * interval. */ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ tp->snd_nxt = tp->snd_max; } if (rsm != NULL) { rsm = NULL; goto skip_again; } rsm = NULL; sack_rxmit = 0; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA); goto again; } skip_again: if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) { /* * Calculate/Re-Calculate the hptsi slot in usecs based on * what we have sent so far */ slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) slot = 0; } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA); enobufs: if (bbr->rc_use_google == 0) bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes))); bbr->rc_output_starts_timer = 1; if (bbr->bbr_use_rack_cheat && (more_to_rxt || ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) { /* Rack cheats and shotguns out all rxt's 1ms apart */ if (slot > 1000) slot = 1000; } if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) { /* * We don't change the tso size until some number of sends * to give the hardware commands time to get down * to the interface. */ bbr->r_ctl.bbr_hdwr_cnt_noset_snt++; if (bbr->r_ctl.bbr_hdwr_cnt_noset_snt >= bbr_hdwr_pacing_delay_cnt) { bbr->hw_pacing_set = 1; tcp_bbr_tso_size_check(bbr, cts); } } bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ tp->snd_nxt = tp->snd_max; } return (error); } /* * See bbr_output_wtime() for return values. */ static int bbr_output(struct tcpcb *tp) { int32_t ret; struct timeval tv; struct tcp_bbr *bbr; NET_EPOCH_ASSERT(); bbr = (struct tcp_bbr *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); (void)tcp_get_usecs(&tv); ret = bbr_output_wtime(tp, &tv); return (ret); } static void bbr_mtu_chg(struct tcpcb *tp) { struct tcp_bbr *bbr; struct bbr_sendmap *rsm, *frsm = NULL; uint32_t maxseg; /* * The MTU has changed. a) Clear the sack filter. b) Mark everything * over the current size as SACK_PASS so a retransmit will occur. */ bbr = (struct tcp_bbr *)tp->t_fb_ptr; maxseg = tp->t_maxseg - bbr->rc_last_options; sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { /* Don't mess with ones acked (by sack?) */ if (rsm->r_flags & BBR_ACKED) continue; if ((rsm->r_end - rsm->r_start) > maxseg) { /* * We mark sack-passed on all the previous large * sends we did. This will force them to retransmit. 
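 * For example, if the path MTU collapses so maxseg drops from 8960
 * to 1448, every unacked map entry larger than 1448 bytes is tagged
 * here (and counted into rc_lost if bbr_is_lost() agrees), and the
 * first such entry becomes rc_resend so retransmission starts from
 * the lowest affected sequence.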
*/ rsm->r_flags |= BBR_SACK_PASSED; if (((rsm->r_flags & BBR_MARKED_LOST) == 0) && bbr_is_lost(bbr, rsm, bbr->r_ctl.rc_rcvtime)) { bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; rsm->r_flags |= BBR_MARKED_LOST; } if (frsm == NULL) frsm = rsm; } } if (frsm) { bbr->r_ctl.rc_resend = frsm; } } /* * bbr_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ static int bbr_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr) { int32_t error = 0, optval; switch (sopt->sopt_name) { case TCP_RACK_PACE_MAX_SEG: case TCP_RACK_MIN_TO: case TCP_RACK_REORD_THRESH: case TCP_RACK_REORD_FADE: case TCP_RACK_TLP_THRESH: case TCP_RACK_PKT_DELAY: case TCP_BBR_ALGORITHM: case TCP_BBR_TSLIMITS: case TCP_BBR_IWINTSO: case TCP_BBR_RECFORCE: case TCP_BBR_STARTUP_PG: case TCP_BBR_DRAIN_PG: case TCP_BBR_RWND_IS_APP: case TCP_BBR_PROBE_RTT_INT: case TCP_BBR_PROBE_RTT_GAIN: case TCP_BBR_PROBE_RTT_LEN: case TCP_BBR_STARTUP_LOSS_EXIT: case TCP_BBR_USEDEL_RATE: case TCP_BBR_MIN_RTO: case TCP_BBR_MAX_RTO: case TCP_BBR_PACE_PER_SEC: case TCP_DELACK: case TCP_BBR_PACE_DEL_TAR: case TCP_BBR_SEND_IWND_IN_TSO: case TCP_BBR_EXTRA_STATE: case TCP_BBR_UTTER_MAX_TSO: case TCP_BBR_MIN_TOPACEOUT: case TCP_BBR_FLOOR_MIN_TSO: case TCP_BBR_TSTMP_RAISES: case TCP_BBR_POLICER_DETECT: case TCP_BBR_USE_RACK_CHEAT: case TCP_DATA_AFTER_CLOSE: case TCP_BBR_HDWR_PACE: case TCP_BBR_PACE_SEG_MAX: case TCP_BBR_PACE_SEG_MIN: case TCP_BBR_PACE_CROSS: case TCP_BBR_PACE_OH: #ifdef NETFLIX_PEAKRATE case TCP_MAXPEAKRATE: #endif case TCP_BBR_TMR_PACE_OH: case TCP_BBR_RACK_RTT_USE: case TCP_BBR_RETRAN_WTSO: break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) return (error); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); bbr = (struct tcp_bbr *)tp->t_fb_ptr; switch (sopt->sopt_name) { case TCP_BBR_PACE_PER_SEC: BBR_OPTS_INC(tcp_bbr_pace_per_sec); bbr->r_ctl.bbr_hptsi_per_second = optval; break; case TCP_BBR_PACE_DEL_TAR: BBR_OPTS_INC(tcp_bbr_pace_del_tar); bbr->r_ctl.bbr_hptsi_segments_delay_tar = optval; break; case TCP_BBR_PACE_SEG_MAX: BBR_OPTS_INC(tcp_bbr_pace_seg_max); bbr->r_ctl.bbr_hptsi_segments_max = optval; break; case TCP_BBR_PACE_SEG_MIN: BBR_OPTS_INC(tcp_bbr_pace_seg_min); bbr->r_ctl.bbr_hptsi_bytes_min = optval; break; case TCP_BBR_PACE_CROSS: BBR_OPTS_INC(tcp_bbr_pace_cross); bbr->r_ctl.bbr_cross_over = optval; break; case TCP_BBR_ALGORITHM: BBR_OPTS_INC(tcp_bbr_algorithm); if (optval && (bbr->rc_use_google == 0)) { /* Turn on the google mode */ bbr_google_mode_on(bbr); if ((optval > 3) && (optval < 500)) { /* * Must be at least greater than .3% * and must be less than 50.0%. 
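 * (The units are tenths of a percent: optval must exceed 3, i.e.
 * 0.3%, and stay below 500, i.e. 50.0%, so for example optval 10
 * applies a 1.0% discount.)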
*/ bbr->r_ctl.bbr_google_discount = optval; } } else if ((optval == 0) && (bbr->rc_use_google == 1)) { /* Turn off the google mode */ bbr_google_mode_off(bbr); } break; case TCP_BBR_TSLIMITS: BBR_OPTS_INC(tcp_bbr_tslimits); if (optval == 1) bbr->rc_use_ts_limit = 1; else if (optval == 0) bbr->rc_use_ts_limit = 0; else error = EINVAL; break; case TCP_BBR_IWINTSO: BBR_OPTS_INC(tcp_bbr_iwintso); if ((optval >= 0) && (optval < 128)) { uint32_t twin; bbr->rc_init_win = optval; twin = bbr_initial_cwnd(bbr, tp); if ((bbr->rc_past_init_win == 0) && (twin > tp->snd_cwnd)) tp->snd_cwnd = twin; else error = EBUSY; } else error = EINVAL; break; case TCP_BBR_STARTUP_PG: BBR_OPTS_INC(tcp_bbr_startup_pg); if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) { bbr->r_ctl.rc_startup_pg = optval; if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { bbr->r_ctl.rc_bbr_hptsi_gain = optval; } } else error = EINVAL; break; case TCP_BBR_DRAIN_PG: BBR_OPTS_INC(tcp_bbr_drain_pg); if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) bbr->r_ctl.rc_drain_pg = optval; else error = EINVAL; break; case TCP_BBR_PROBE_RTT_LEN: BBR_OPTS_INC(tcp_bbr_probertt_len); if (optval <= 1) reset_time_small(&bbr->r_ctl.rc_rttprop, (optval * USECS_IN_SECOND)); else error = EINVAL; break; case TCP_BBR_PROBE_RTT_GAIN: BBR_OPTS_INC(tcp_bbr_probertt_gain); if (optval <= BBR_UNIT) bbr->r_ctl.bbr_rttprobe_gain_val = optval; else error = EINVAL; break; case TCP_BBR_PROBE_RTT_INT: BBR_OPTS_INC(tcp_bbr_probe_rtt_int); if (optval > 1000) bbr->r_ctl.rc_probertt_int = optval; else error = EINVAL; break; case TCP_BBR_MIN_TOPACEOUT: BBR_OPTS_INC(tcp_bbr_topaceout); if (optval == 0) { bbr->no_pacing_until = 0; bbr->rc_no_pacing = 0; } else if (optval <= 0x00ff) { bbr->no_pacing_until = optval; if ((bbr->r_ctl.rc_pkt_epoch < bbr->no_pacing_until) && (bbr->rc_bbr_state == BBR_STATE_STARTUP)){ /* Turn on no pacing */ bbr->rc_no_pacing = 1; } } else error = EINVAL; break; case TCP_BBR_STARTUP_LOSS_EXIT: BBR_OPTS_INC(tcp_bbr_startup_loss_exit); bbr->rc_loss_exit = optval; break; case TCP_BBR_USEDEL_RATE: error = EINVAL; break; case TCP_BBR_MIN_RTO: BBR_OPTS_INC(tcp_bbr_min_rto); bbr->r_ctl.rc_min_rto_ms = optval; break; case TCP_BBR_MAX_RTO: BBR_OPTS_INC(tcp_bbr_max_rto); bbr->rc_max_rto_sec = optval; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ BBR_OPTS_INC(tcp_rack_min_to); bbr->r_ctl.rc_min_to = optval; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ BBR_OPTS_INC(tcp_rack_reord_thresh); if ((optval > 0) && (optval < 31)) bbr->r_ctl.rc_reorder_shift = optval; else error = EINVAL; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ BBR_OPTS_INC(tcp_rack_reord_fade); bbr->r_ctl.rc_reorder_fade = optval; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. 
srtt+(srtt/N) */ BBR_OPTS_INC(tcp_rack_tlp_thresh); if (optval) bbr->rc_tlp_threshold = optval; else error = EINVAL; break; case TCP_BBR_USE_RACK_CHEAT: BBR_OPTS_INC(tcp_use_rackcheat); if (bbr->rc_use_google) { error = EINVAL; break; } BBR_OPTS_INC(tcp_rack_cheat); if (optval) bbr->bbr_use_rack_cheat = 1; else bbr->bbr_use_rack_cheat = 0; break; case TCP_BBR_FLOOR_MIN_TSO: BBR_OPTS_INC(tcp_utter_max_tso); if ((optval >= 0) && (optval < 40)) bbr->r_ctl.bbr_hptsi_segments_floor = optval; else error = EINVAL; break; case TCP_BBR_UTTER_MAX_TSO: BBR_OPTS_INC(tcp_utter_max_tso); if ((optval >= 0) && (optval < 0xffff)) bbr->r_ctl.bbr_utter_max = optval; else error = EINVAL; break; case TCP_BBR_EXTRA_STATE: BBR_OPTS_INC(tcp_extra_state); if (optval) bbr->rc_use_idle_restart = 1; else bbr->rc_use_idle_restart = 0; break; case TCP_BBR_SEND_IWND_IN_TSO: BBR_OPTS_INC(tcp_iwnd_tso); if (optval) { bbr->bbr_init_win_cheat = 1; if (bbr->rc_past_init_win == 0) { uint32_t cts; cts = tcp_get_usecs(&bbr->rc_tv); tcp_bbr_tso_size_check(bbr, cts); } } else bbr->bbr_init_win_cheat = 0; break; case TCP_BBR_HDWR_PACE: BBR_OPTS_INC(tcp_hdwr_pacing); if (optval){ bbr->bbr_hdw_pace_ena = 1; bbr->bbr_attempt_hdwr_pace = 0; } else { bbr->bbr_hdw_pace_ena = 0; #ifdef RATELIMIT if (bbr->bbr_hdrw_pacing) { bbr->bbr_hdrw_pacing = 0; in_pcbdetach_txrtlmt(bbr->rc_inp); } #endif } break; case TCP_DELACK: BBR_OPTS_INC(tcp_delack); if (optval < 100) { if (optval == 0) /* off */ tp->t_delayed_ack = 0; else if (optval == 1) /* on which is 2 */ tp->t_delayed_ack = 2; else /* higher than 2 and less than 100 */ tp->t_delayed_ack = optval; if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; bbr_output(tp); } } else error = EINVAL; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
rack-rtt + reord + N */ BBR_OPTS_INC(tcp_rack_pkt_delay); bbr->r_ctl.rc_pkt_delay = optval; break; #ifdef NETFLIX_PEAKRATE case TCP_MAXPEAKRATE: BBR_OPTS_INC(tcp_maxpeak); error = tcp_set_maxpeakrate(tp, optval); if (!error) tp->t_peakrate_thr = tp->t_maxpeakrate; break; #endif case TCP_BBR_RETRAN_WTSO: BBR_OPTS_INC(tcp_retran_wtso); if (optval) bbr->rc_resends_use_tso = 1; else bbr->rc_resends_use_tso = 0; break; case TCP_DATA_AFTER_CLOSE: BBR_OPTS_INC(tcp_data_ac); if (optval) bbr->rc_allow_data_af_clo = 1; else bbr->rc_allow_data_af_clo = 0; break; case TCP_BBR_POLICER_DETECT: BBR_OPTS_INC(tcp_policer_det); if (bbr->rc_use_google == 0) error = EINVAL; else if (optval) bbr->r_use_policer = 1; else bbr->r_use_policer = 0; break; case TCP_BBR_TSTMP_RAISES: BBR_OPTS_INC(tcp_ts_raises); if (optval) bbr->ts_can_raise = 1; else bbr->ts_can_raise = 0; break; case TCP_BBR_TMR_PACE_OH: BBR_OPTS_INC(tcp_pacing_oh_tmr); if (bbr->rc_use_google) { error = EINVAL; } else { if (optval) bbr->r_ctl.rc_incr_tmrs = 1; else bbr->r_ctl.rc_incr_tmrs = 0; } break; case TCP_BBR_PACE_OH: BBR_OPTS_INC(tcp_pacing_oh); if (bbr->rc_use_google) { error = EINVAL; } else { if (optval > (BBR_INCL_TCP_OH| BBR_INCL_IP_OH| BBR_INCL_ENET_OH)) { error = EINVAL; break; } if (optval & BBR_INCL_TCP_OH) bbr->r_ctl.rc_inc_tcp_oh = 1; else bbr->r_ctl.rc_inc_tcp_oh = 0; if (optval & BBR_INCL_IP_OH) bbr->r_ctl.rc_inc_ip_oh = 1; else bbr->r_ctl.rc_inc_ip_oh = 0; if (optval & BBR_INCL_ENET_OH) bbr->r_ctl.rc_inc_enet_oh = 1; else bbr->r_ctl.rc_inc_enet_oh = 0; } break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } #ifdef NETFLIX_STATS tcp_log_socket_option(tp, sopt->sopt_name, optval, error); #endif INP_WUNLOCK(inp); return (error); } /* * return 0 on success, error-num on failure */ static int bbr_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr) { int32_t error, optval; /* * Because all our options are either boolean or an int, we can just * pull everything into optval and then unlock and copy. If we ever * add a option that is not a int, then this will have quite an * impact to this routine. 
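 * The unlock-then-copy order matters: sooptcopyout() writes to user
 * memory and may fault or sleep, which is not permitted while the
 * inpcb lock is held, so the value is snapshotted into the local
 * optval first.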
*/ switch (sopt->sopt_name) { case TCP_BBR_PACE_PER_SEC: optval = bbr->r_ctl.bbr_hptsi_per_second; break; case TCP_BBR_PACE_DEL_TAR: optval = bbr->r_ctl.bbr_hptsi_segments_delay_tar; break; case TCP_BBR_PACE_SEG_MAX: optval = bbr->r_ctl.bbr_hptsi_segments_max; break; case TCP_BBR_MIN_TOPACEOUT: optval = bbr->no_pacing_until; break; case TCP_BBR_PACE_SEG_MIN: optval = bbr->r_ctl.bbr_hptsi_bytes_min; break; case TCP_BBR_PACE_CROSS: optval = bbr->r_ctl.bbr_cross_over; break; case TCP_BBR_ALGORITHM: optval = bbr->rc_use_google; break; case TCP_BBR_TSLIMITS: optval = bbr->rc_use_ts_limit; break; case TCP_BBR_IWINTSO: optval = bbr->rc_init_win; break; case TCP_BBR_STARTUP_PG: optval = bbr->r_ctl.rc_startup_pg; break; case TCP_BBR_DRAIN_PG: optval = bbr->r_ctl.rc_drain_pg; break; case TCP_BBR_PROBE_RTT_INT: optval = bbr->r_ctl.rc_probertt_int; break; case TCP_BBR_PROBE_RTT_LEN: optval = (bbr->r_ctl.rc_rttprop.cur_time_limit / USECS_IN_SECOND); break; case TCP_BBR_PROBE_RTT_GAIN: optval = bbr->r_ctl.bbr_rttprobe_gain_val; break; case TCP_BBR_STARTUP_LOSS_EXIT: optval = bbr->rc_loss_exit; break; case TCP_BBR_USEDEL_RATE: error = EINVAL; break; case TCP_BBR_MIN_RTO: optval = bbr->r_ctl.rc_min_rto_ms; break; case TCP_BBR_MAX_RTO: optval = bbr->rc_max_rto_sec; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ optval = bbr->r_ctl.rc_pace_max_segs; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ optval = bbr->r_ctl.rc_min_to; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ optval = bbr->r_ctl.rc_reorder_shift; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ optval = bbr->r_ctl.rc_reorder_fade; break; case TCP_BBR_USE_RACK_CHEAT: /* Do we use the rack cheat for rxt */ optval = bbr->bbr_use_rack_cheat; break; case TCP_BBR_FLOOR_MIN_TSO: optval = bbr->r_ctl.bbr_hptsi_segments_floor; break; case TCP_BBR_UTTER_MAX_TSO: optval = bbr->r_ctl.bbr_utter_max; break; case TCP_BBR_SEND_IWND_IN_TSO: /* Do we send TSO size segments initially */ optval = bbr->bbr_init_win_cheat; break; case TCP_BBR_EXTRA_STATE: optval = bbr->rc_use_idle_restart; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ optval = bbr->rc_tlp_threshold; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. rack-rtt + reord + N */ optval = bbr->r_ctl.rc_pkt_delay; break; case TCP_BBR_RETRAN_WTSO: optval = bbr->rc_resends_use_tso; break; case TCP_DATA_AFTER_CLOSE: optval = bbr->rc_allow_data_af_clo; break; case TCP_DELACK: optval = tp->t_delayed_ack; break; case TCP_BBR_HDWR_PACE: optval = bbr->bbr_hdw_pace_ena; break; case TCP_BBR_POLICER_DETECT: optval = bbr->r_use_policer; break; case TCP_BBR_TSTMP_RAISES: optval = bbr->ts_can_raise; break; case TCP_BBR_TMR_PACE_OH: optval = bbr->r_ctl.rc_incr_tmrs; break; case TCP_BBR_PACE_OH: optval = 0; if (bbr->r_ctl.rc_inc_tcp_oh) optval |= BBR_INCL_TCP_OH; if (bbr->r_ctl.rc_inc_ip_oh) optval |= BBR_INCL_IP_OH; if (bbr->r_ctl.rc_inc_enet_oh) optval |= BBR_INCL_ENET_OH; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); return (error); } /* * return 0 on success, error-num on failure */ static int bbr_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int32_t error = EINVAL; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; if (bbr == NULL) { /* Huh? 
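 * A NULL t_fb_ptr means the BBR stack is not (or is no longer)
 * attached to this tcpcb, so fail the option request with EINVAL.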
*/ goto out; } if (sopt->sopt_dir == SOPT_SET) { return (bbr_set_sockopt(so, sopt, inp, tp, bbr)); } else if (sopt->sopt_dir == SOPT_GET) { return (bbr_get_sockopt(so, sopt, inp, tp, bbr)); } out: INP_WUNLOCK(inp); return (error); } struct tcp_function_block __tcp_bbr = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = bbr_output, .tfb_do_queued_segments = ctf_do_queued_segments, .tfb_do_segment_nounlock = bbr_do_segment_nounlock, .tfb_tcp_do_segment = bbr_do_segment, .tfb_tcp_ctloutput = bbr_ctloutput, .tfb_tcp_fb_init = bbr_init, .tfb_tcp_fb_fini = bbr_fini, .tfb_tcp_timer_stop_all = bbr_stopall, .tfb_tcp_timer_activate = bbr_timer_activate, .tfb_tcp_timer_active = bbr_timer_active, .tfb_tcp_timer_stop = bbr_timer_stop, .tfb_tcp_rexmit_tmr = bbr_remxt_tmr, .tfb_tcp_handoff_ok = bbr_handoff_ok, .tfb_tcp_mtu_chg = bbr_mtu_chg }; static const char *bbr_stack_names[] = { __XSTRING(STACKNAME), #ifdef STACKALIAS __XSTRING(STACKALIAS), #endif }; static bool bbr_mod_inited = false; static int tcp_addbbr(module_t mod, int32_t type, void *data) { int32_t err = 0; int num_stacks; switch (type) { case MOD_LOAD: printf("Attempting to load " __XSTRING(MODNAME) "\n"); bbr_zone = uma_zcreate(__XSTRING(MODNAME) "_map", sizeof(struct bbr_sendmap), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); bbr_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", sizeof(struct tcp_bbr), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); sysctl_ctx_init(&bbr_sysctl_ctx); bbr_sysctl_root = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp), OID_AUTO, #ifdef STACKALIAS __XSTRING(STACKALIAS), #else __XSTRING(STACKNAME), #endif CTLFLAG_RW, 0, ""); if (bbr_sysctl_root == NULL) { printf("Failed to add sysctl node\n"); err = EFAULT; goto free_uma; } bbr_init_sysctls(); num_stacks = nitems(bbr_stack_names); err = register_tcp_functions_as_names(&__tcp_bbr, M_WAITOK, bbr_stack_names, &num_stacks); if (err) { printf("Failed to register %s stack name for " "%s module\n", bbr_stack_names[num_stacks], __XSTRING(MODNAME)); sysctl_ctx_free(&bbr_sysctl_ctx); free_uma: uma_zdestroy(bbr_zone); uma_zdestroy(bbr_pcb_zone); bbr_counter_destroy(); printf("Failed to register " __XSTRING(MODNAME) " module err:%d\n", err); return (err); } tcp_lro_reg_mbufq(); bbr_mod_inited = true; printf(__XSTRING(MODNAME) " is now available\n"); break; case MOD_QUIESCE: err = deregister_tcp_functions(&__tcp_bbr, true, false); break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_bbr, false, true); if (err == EBUSY) break; if (bbr_mod_inited) { uma_zdestroy(bbr_zone); uma_zdestroy(bbr_pcb_zone); sysctl_ctx_free(&bbr_sysctl_ctx); bbr_counter_destroy(); printf(__XSTRING(MODNAME) " is now no longer available\n"); bbr_mod_inited = false; } tcp_lro_dereg_mbufq(); err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t tcp_bbr = { .name = __XSTRING(MODNAME), .evhand = tcp_addbbr, .priv = 0 }; MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_bbr, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); Index: head/sys/netinet/tcp_stacks/tcp_bbr.h =================================================================== --- head/sys/netinet/tcp_stacks/tcp_bbr.h (revision 357663) +++ head/sys/netinet/tcp_stacks/tcp_bbr.h (revision 357664) @@ -1,832 +1,832 @@ /*- * Copyright (c) 2016-9 * Netflix Inc. All rights reserved. * Author Randall R. 
Stewart * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETINET_TCP_BBR_H_ #define _NETINET_TCP_BBR_H_ #define BBR_INITIAL_RTO 1000000 /* 1 second in micro-seconds */ /* Send map flags */ #define BBR_ACKED 0x0001 /* The remote endpoint acked this */ #define BBR_WAS_RENEGED 0x0002 /* The peer reneged the ack */ #define BBR_RXT_CLEARED 0x0004 /* ACK Cleared by the RXT timer */ #define BBR_OVERMAX 0x0008 /* We have more retran's then we can * fit */ #define BBR_SACK_PASSED 0x0010 /* A sack was done above this block */ #define BBR_WAS_SACKPASS 0x0020 /* We retransmitted due to SACK pass */ #define BBR_HAS_FIN 0x0040 /* segment is sent with fin */ #define BBR_TLP 0x0080 /* segment sent as tail-loss-probe */ #define BBR_HAS_SYN 0x0100 /* segment has the syn */ #define BBR_MARKED_LOST 0x0200 /* * This segments is lost and * totaled into bbr->rc_ctl.rc_lost */ #define BBR_RWND_COLLAPSED 0x0400 /* The peer collapsed the rwnd on the segment */ #define BBR_NUM_OF_RETRANS 7 /* Defines for socket options to set pacing overheads */ #define BBR_INCL_ENET_OH 0x01 #define BBR_INCL_IP_OH 0x02 #define BBR_INCL_TCP_OH 0x03 /* * With the addition of both measurement algorithms * I had to move over the size of a * cache line (unfortunately). For now there is * no way around this. We may be able to cut back * at some point I hope. */ struct bbr_sendmap { TAILQ_ENTRY(bbr_sendmap) r_next; /* seq number arrayed next */ TAILQ_ENTRY(bbr_sendmap) r_tnext; /* Time of tmit based next */ uint32_t r_start; /* Sequence number of the segment */ uint32_t r_end; /* End seq, this is 1 beyond actually */ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ uint32_t r_delivered; /* Delivered amount at send */ uint32_t r_del_time; /* The time of the last delivery update */ uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time * sent */ unused_bit:1, r_is_drain:1, /* In a draining cycle */ r_app_limited:1,/* We went app limited */ r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */ uint8_t r_dupack; /* Dup ack count */ uint8_t r_in_tmap:1, /* Flag to see if its in the r_tnext array */ r_is_smallmap:1,/* Was logged as a small-map send-map item */ r_is_gain:1, /* Was in gain cycle */ r_bbr_state:5; /* The BBR state at send */ uint8_t r_limit_type; /* is this entry counted against a limit? 
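 * (currently the only limit type is BBR_LIMIT_TYPE_SPLIT,
 * defined just below)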
*/ uint16_t r_flags; /* Flags as defined above */ uint16_t r_spare16; uint32_t r_del_ack_ts; /* At send what timestamp of peer was (if r_ts_valid set) */ /****************Cache line*****************/ uint32_t r_tim_lastsent[BBR_NUM_OF_RETRANS]; /* * Question, should we instead just grab the sending b/w * from the filter with the gain and store it in a * uint64_t instead? */ uint32_t r_first_sent_time; /* Time of first pkt in flight sent */ uint32_t r_pacing_delay; /* pacing delay of this send */ uint32_t r_flight_at_send; /* flight at the time of the send */ #ifdef _KERNEL } __aligned(CACHE_LINE_SIZE); #else }; #endif #define BBR_LIMIT_TYPE_SPLIT 1 TAILQ_HEAD(bbr_head, bbr_sendmap); #define BBR_SEGMENT_TIME_SIZE 1500 /* How many bytes in time_between */ #define BBR_MIN_SEG 1460 /* MSS size */ #define BBR_MAX_GAIN_VALUE 0xffff #define BBR_TIMER_FUDGE 1500 /* 1.5ms in micro seconds */ /* BW twiddle secret codes */ #define BBR_RED_BW_CONGSIG 0 /* We enter recovery and set using b/w */ #define BBR_RED_BW_RATECAL 1 /* We are calculating the loss rate */ #define BBR_RED_BW_USELRBW 2 /* We are dropping the lower b/w with * cDR */ #define BBR_RED_BW_SETHIGHLOSS 3 /* We have set our highloss value at * exit from probe-rtt */ #define BBR_RED_BW_PE_CLREARLY 4 /* We have decided to clear the * reduction early */ #define BBR_RED_BW_PE_CLAFDEL 5 /* We are clearing it on schedule * delayed */ #define BBR_RED_BW_REC_ENDCLL 6 /* Recover exits save high if needed * an clear to start measuring */ #define BBR_RED_BW_PE_NOEARLY_OUT 7 /* Set pkt epoch judged that we do not * get out of jail early */ /* For calculating a rate */ #define BBR_CALC_BW 1 #define BBR_CALC_LOSS 2 #define BBR_RTT_BY_TIMESTAMP 0 #define BBR_RTT_BY_EXACTMATCH 1 #define BBR_RTT_BY_EARLIER_RET 2 #define BBR_RTT_BY_THIS_RETRAN 3 #define BBR_RTT_BY_SOME_RETRAN 4 #define BBR_RTT_BY_TSMATCHING 5 /* Markers to track where we enter persists from */ #define BBR_PERSISTS_FROM_1 1 #define BBR_PERSISTS_FROM_2 2 #define BBR_PERSISTS_FROM_3 3 #define BBR_PERSISTS_FROM_4 4 #define BBR_PERSISTS_FROM_5 5 /* magic cookies to ask for the RTT */ #define BBR_RTT_PROP 0 #define BBR_RTT_RACK 1 #define BBR_RTT_PKTRTT 2 #define BBR_SRTT 3 #define BBR_SACKED 0 #define BBR_CUM_ACKED 1 /* threshold in useconds where we consider we need a higher min cwnd */ #define BBR_HIGH_SPEED 1000 #define BBR_HIGHSPEED_NUM_MSS 12 #define MAX_REDUCE_RXT 3 /* What is the maximum times we are willing to * reduce b/w in RTX's. Setting this has a * multiplicative effect e.g. if we are * reducing by 20% then setting it to 3 means * you will have reduced the b/w estimate by > * 60% before you stop. */ /* * We use the rate sample structure to * assist in single sack/ack rate and rtt * calculation. In the future we will expand * this in BBR to do forward rate sample * b/w estimation. 
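 *
 * A delivery-rate sample of the kind these fields support is simply
 * bytes newly delivered over the time they took: pair the connection
 * totals at ACK time with the r_delivered/r_del_time snapshot kept
 * in bbr_sendmap.  A minimal sketch, arithmetic illustrative:
 *
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	delivery_rate(uint32_t delivered_now, uint32_t r_delivered,
 *	    uint32_t now_usecs, uint32_t r_del_time)
 *	{
 *		uint32_t bytes = delivered_now - r_delivered;
 *		uint32_t usecs = now_usecs - r_del_time;
 *
 *		if (usecs == 0)
 *			return (0);	// clock tie: no sample
 *		return ((uint64_t)bytes * 1000000 / usecs);
 *	}
 *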
*/ #define BBR_RS_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTTs */ #define BBR_RS_BW_EMPTY 0x00000002 /* Nothing yet stored in cDR */ #define BBR_RS_RTT_VALID 0x00000004 /* We have at least one valid RTT */ #define BBR_RS_BW_VAILD 0x00000008 /* We have a valid cDR */ #define BBR_RS_EMPTY (BBR_RS_RTT_EMPTY|BBR_RS_BW_EMPTY) struct bbr_rtt_sample { uint32_t rs_flags; uint32_t rs_rtt_lowest; uint32_t rs_rtt_lowest_sendtime; uint32_t rs_rtt_low_seq_start; uint32_t rs_rtt_highest; uint32_t rs_rtt_cnt; uint64_t rs_rtt_tot; uint32_t cur_rtt; uint32_t cur_rtt_bytecnt; uint32_t cur_rtt_rsmcnt; uint32_t rc_crtt_set:1, avail_bits:31; uint64_t rs_cDR; }; /* RTT shrink reasons */ #define BBR_RTTS_INIT 0 #define BBR_RTTS_NEWRTT 1 #define BBR_RTTS_RTTPROBE 2 #define BBR_RTTS_WASIDLE 3 #define BBR_RTTS_PERSIST 4 #define BBR_RTTS_REACHTAR 5 #define BBR_RTTS_ENTERPROBE 6 #define BBR_RTTS_SHRINK_PG 7 #define BBR_RTTS_SHRINK_PG_FINAL 8 #define BBR_RTTS_NEW_TARGET 9 #define BBR_RTTS_LEAVE_DRAIN 10 #define BBR_RTTS_RESETS_VALUES 11 #define BBR_NUM_RATES 5 /* Rate flags */ #define BBR_RT_FLAG_FREE 0x00 /* Is on the free list */ #define BBR_RT_FLAG_INUSE 0x01 /* Has been allocated */ #define BBR_RT_FLAG_READY 0x02 /* Ready to initiate a measurement. */ #define BBR_RT_FLAG_CAPPED_PRE 0x04 /* Ready to cap if we send the next segment */ #define BBR_RT_FLAG_CAPPED 0x08 /* Measurement is capped */ #define BBR_RT_FLAG_PASTFA 0x10 /* Past the first ack. */ #define BBR_RT_FLAG_LIMITED 0x20 /* Saw application/cwnd or rwnd limited period */ #define BBR_RT_SEEN_A_ACK 0x40 /* An ack has been saved */ #define BBR_RT_PREV_RTT_SET 0x80 /* There was an RTT set in */ #define BBR_RT_PREV_SEND_TIME 0x100 /* *There was an RTT send time set that can be used * no snd_limits */ #define BBR_RT_SET_GRADIENT 0x200 #define BBR_RT_TS_VALID 0x400 struct bbr_log { union { struct bbr_sendmap *rsm; /* For alloc/free */ uint64_t sb_acc; /* For out/ack or t-o */ }; struct tcpcb *tp; uint32_t t_flags; uint32_t th_seq; uint32_t th_ack; uint32_t snd_una; uint32_t snd_nxt; uint32_t snd_max; uint32_t snd_cwnd; uint32_t snd_wnd; uint32_t rc_lost; uint32_t target_cwnd; /* UU */ uint32_t inflight; /* UU */ uint32_t applimited; /* UU */ /* Things for BBR */ uint32_t delivered; /* UU */ uint64_t cur_del_rate; /* UU */ uint64_t delRate; /* UU */ uint64_t rttProp; /* UU */ uint64_t lt_bw; /* UU */ uint32_t timeStamp; uint32_t time; uint32_t slot; /* UU */ uint32_t delayed_by; uint32_t exp_del; uint32_t pkts_out; uint32_t new_win; uint32_t hptsi_gain; /* UU */ uint32_t cwnd_gain; /* UU */ uint32_t epoch; /* UU */ uint32_t lt_epoch; /* UU */ /* Sack fun */ uint32_t blk_start[4]; /* xx */ uint32_t blk_end[4]; uint32_t len; /* Timeout T3=1, TLP=2, RACK=3 */ uint8_t type; uint8_t n_sackblks; uint8_t applied; /* UU */ uint8_t inhpts; /* UU */ uint8_t ininput; /* UU */ uint8_t use_lt_bw; /* UU */ }; struct bbr_log_sysctl_out { uint32_t bbr_log_at; uint32_t bbr_log_max; struct bbr_log entries[0]; }; /* * Magic numbers for logging timeout events if the * logging is enabled. */ #define BBR_TO_FRM_TMR 1 #define BBR_TO_FRM_TLP 2 #define BBR_TO_FRM_RACK 3 #define BBR_TO_FRM_KEEP 4 #define BBR_TO_FRM_PERSIST 5 #define BBR_TO_FRM_DELACK 6 #define BBR_SEES_STRETCH_ACK 1 #define BBR_SEES_COMPRESSED_ACKS 2 /* * As we get each SACK we wade through the * rc_map and mark off what is acked. * We also increment rc_sacked as well. * * We also pay attention to missing entries * based on the time and possibly mark them * for retransmit.
If we do and we are not already * in recovery we enter recovery. In doing * so we clear prr_delivered/holes_rxt and prr_sent_dur_rec. * We also set up rc_next/rc_snd_nxt/rc_send_end so * we will know where to send from. When not in * recovery rc_next will be NULL and rc_snd_nxt should * equal snd_max. * * Whenever we retransmit from recovery we increment * rc_holes_rxt as we retran a block and mark it as retransmitted * with the time it was sent. During non-recovery sending we * add to our map and note down the time of any send expanding * the rc_map at the tail and moving rc_snd_nxt up with snd_max. * * In recovery during SACK/ACK processing if a chunk has * been retransmitted and it is now acked, we decrement rc_holes_rxt. * When we retransmit from the scoreboard we use * rc_next and rc_snd_nxt/rc_send_end to help us * find what needs to be retran. * * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt. * This gets us the effect of RFC6675 pipe, counting twice for * bytes retransmitted. */ #define TT_BBR_FR_TMR 0x2001 #define BBR_SCALE 8 #define BBR_UNIT (1 << BBR_SCALE) #define BBR_NUM_RTTS_FOR_DEL_LIMIT 8 /* How many pkt-rtts do we keep * Delivery rate for */ #define BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT 10 /* How many pkt-rtts do we keep * Delivery rate for google */ #define BBR_SECONDS_NO_RTT 10 /* 10 seconds with no RTT shrinkage */ #define BBR_PROBERTT_MAX 200 /* 200ms */ #define BBR_PROBERTT_NUM_MSS 4 #define BBR_STARTUP_EPOCHS 3 #define USECS_IN_MSEC 1000 #define BBR_TIME_TO_SECONDS(a) (a / USECS_IN_SECOND) #define BBR_TIME_TO_MILLI(a) (a / MS_IN_USEC) /* BBR keeps time in usecs so we divide by 1000 and round up */ #define BBR_TS_TO_MS(t) ((t+999)/MS_IN_USEC) /* * Locking for the rack control block. * a) Locked by INP_WLOCK * b) Locked by the hpts-mutex * */ #define BBR_STATE_STARTUP 0x01 #define BBR_STATE_DRAIN 0x02 #define BBR_STATE_PROBE_BW 0x03 #define BBR_STATE_PROBE_RTT 0x04 #define BBR_STATE_IDLE_EXIT 0x05 /* Substate defines for STATE == PROBE_BW */ #define BBR_SUB_GAIN 0 /* State 0 where we are 5/4 BBR_UNIT */ #define BBR_SUB_DRAIN 1 /* State 1 where we are at 3/4 BBR_UNIT */ #define BBR_SUB_LEVEL1 2 /* State 1 first BBR_UNIT */ #define BBR_SUB_LEVEL2 3 /* State 2nd BBR_UNIT */ #define BBR_SUB_LEVEL3 4 /* State 3rd BBR_UNIT */ #define BBR_SUB_LEVEL4 5 /* State 4th BBR_UNIT */ #define BBR_SUB_LEVEL5 6 /* State 5th BBR_UNIT */ #define BBR_SUB_LEVEL6 7 /* State last BBR_UNIT */ #define BBR_SUBSTATE_COUNT 8 /* Single remaining reduce log */ #define BBR_REDUCE_AT_FR 5 #define BBR_BIG_LOG_SIZE 300000 struct bbr_stats { uint64_t bbr_badfr; /* 0 */ uint64_t bbr_badfr_bytes; /* 1 */ uint64_t bbr_saw_oerr; /* 2 */ uint64_t bbr_saw_emsgsiz; /* 3 */ uint64_t bbr_reorder_seen; /* 4 */ uint64_t bbr_tlp_tot; /* 5 */ uint64_t bbr_tlp_newdata; /* 6 */ uint64_t bbr_offset_recovery; /* 7 */ uint64_t bbr_tlp_retran_fail; /* 8 */ uint64_t bbr_to_tot; /* 9 */ uint64_t bbr_to_arm_rack; /* 10 */ uint64_t bbr_enter_probertt; /* 11 */ uint64_t bbr_tlp_set; /* 12 */ uint64_t bbr_resends_set; /* 13 */ uint64_t bbr_force_output; /* 14 */ uint64_t bbr_to_arm_tlp; /* 15 */ uint64_t bbr_paced_segments; /* 16 */ uint64_t bbr_saw_enobuf; /* 17 */ uint64_t bbr_to_alloc_failed; /* 18 */ uint64_t bbr_to_alloc_emerg; /* 19 */ uint64_t bbr_sack_proc_all; /* 20 */ uint64_t bbr_sack_proc_short; /* 21 */ uint64_t bbr_sack_proc_restart; /* 22 */ uint64_t bbr_to_alloc; /* 23 */ uint64_t bbr_offset_drop; /* 24 */ uint64_t bbr_runt_sacks; /* 25 */ uint64_t bbr_sack_passed; /* 26 */ uint64_t
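/*
 * Editor's illustrative sketch, not part of the original header: the
 * pipe calculation described in the scoreboard comment above, written
 * as a hypothetical helper over the fields it names:
 *
 *	static inline uint32_t
 *	bbr_calc_pipe(const struct tcpcb *tp, const struct bbr_control *ctl)
 *	{
 *		return ((tp->snd_max - tp->snd_una) + ctl->rc_holes_rxt);
 *	}
 *
 * This is the RFC 6675 notion of pipe, with retransmitted bytes
 * counted twice until they are acked.
 */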
bbr_rlock_left_ret0; /* 27 */ uint64_t bbr_rlock_left_ret1; /* 28 */ uint64_t bbr_dynamic_rwnd; /* 29 */ uint64_t bbr_static_rwnd; /* 30 */ uint64_t bbr_sack_blocks; /* 31 */ uint64_t bbr_sack_blocks_skip; /* 32 */ uint64_t bbr_sack_search_both; /* 33 */ uint64_t bbr_sack_search_fwd; /* 34 */ uint64_t bbr_sack_search_back; /* 35 */ uint64_t bbr_plain_acks; /* 36 */ uint64_t bbr_acks_with_sacks; /* 37 */ uint64_t bbr_progress_drops; /* 38 */ uint64_t bbr_early; /* 39 */ uint64_t bbr_reneges_seen; /* 40 */ uint64_t bbr_persist_reneg; /* 41 */ uint64_t bbr_dropped_af_data; /* 42 */ uint64_t bbr_failed_mbuf_aloc; /* 43 */ uint64_t bbr_cwnd_limited; /* 44 */ uint64_t bbr_rwnd_limited; /* 45 */ uint64_t bbr_app_limited; /* 46 */ uint64_t bbr_force_timer_start; /* 47 */ uint64_t bbr_hpts_min_time; /* 48 */ uint64_t bbr_meets_tso_thresh; /* 49 */ uint64_t bbr_miss_tso_rwnd; /* 50 */ uint64_t bbr_miss_tso_cwnd; /* 51 */ uint64_t bbr_miss_tso_app; /* 52 */ uint64_t bbr_miss_retran; /* 53 */ uint64_t bbr_miss_tlp; /* 54 */ uint64_t bbr_miss_unknown; /* 55 */ uint64_t bbr_hdwr_rl_add_ok; /* 56 */ uint64_t bbr_hdwr_rl_add_fail; /* 57 */ uint64_t bbr_hdwr_rl_mod_ok; /* 58 */ uint64_t bbr_hdwr_rl_mod_fail; /* 59 */ uint64_t bbr_collapsed_win; /* 60 */ uint64_t bbr_alloc_limited; /* 61 */ uint64_t bbr_alloc_limited_conns; /* 62 */ uint64_t bbr_split_limited; /* 63 */ }; /* * The structure bbr_opts_stats is a simple * way to see how many options are being * changed in the stack. */ struct bbr_opts_stats { uint64_t tcp_bbr_pace_per_sec; uint64_t tcp_bbr_pace_del_tar; uint64_t tcp_bbr_pace_seg_max; uint64_t tcp_bbr_pace_seg_min; uint64_t tcp_bbr_pace_cross; uint64_t tcp_bbr_drain_inc_extra; uint64_t tcp_bbr_unlimited; uint64_t tcp_bbr_iwintso; uint64_t tcp_bbr_rec_over_hpts; uint64_t tcp_bbr_recforce; uint64_t tcp_bbr_startup_pg; uint64_t tcp_bbr_drain_pg; uint64_t tcp_bbr_rwnd_is_app; uint64_t tcp_bbr_probe_rtt_int; uint64_t tcp_bbr_one_retran; uint64_t tcp_bbr_startup_loss_exit; uint64_t tcp_bbr_use_lowgain; uint64_t tcp_bbr_lowgain_thresh; uint64_t tcp_bbr_lowgain_half; uint64_t tcp_bbr_lowgain_fd; uint64_t tcp_bbr_usedel_rate; uint64_t tcp_bbr_min_rto; uint64_t tcp_bbr_max_rto; uint64_t tcp_rack_pace_max_seg; uint64_t tcp_rack_min_to; uint64_t tcp_rack_reord_thresh; uint64_t tcp_rack_reord_fade; uint64_t tcp_rack_tlp_thresh; uint64_t tcp_rack_pkt_delay; uint64_t tcp_bbr_startup_exit_epoch; uint64_t tcp_bbr_ack_comp_alg; uint64_t tcp_rack_cheat; uint64_t tcp_iwnd_tso; uint64_t tcp_utter_max_tso; uint64_t tcp_hdwr_pacing; uint64_t tcp_extra_state; uint64_t tcp_floor_min_tso; /* New */ uint64_t tcp_bbr_algorithm; uint64_t tcp_bbr_tslimits; uint64_t tcp_bbr_probertt_len; uint64_t tcp_bbr_probertt_gain; uint64_t tcp_bbr_topaceout; uint64_t tcp_use_rackcheat; uint64_t tcp_delack; uint64_t tcp_maxpeak; uint64_t tcp_retran_wtso; uint64_t tcp_data_ac; uint64_t tcp_ts_raises; uint64_t tcp_pacing_oh_tmr; uint64_t tcp_pacing_oh; uint64_t tcp_policer_det; }; #ifdef _KERNEL #define BBR_STAT_SIZE (sizeof(struct bbr_stats)/sizeof(uint64_t)) extern counter_u64_t bbr_stat_arry[BBR_STAT_SIZE]; #define BBR_STAT_ADD(name, amm) counter_u64_add(bbr_stat_arry[(offsetof(struct bbr_stats, name)/sizeof(uint64_t))], (amm)) #define BBR_STAT_INC(name) BBR_STAT_ADD(name, 1) #define BBR_OPTS_SIZE (sizeof(struct bbr_opts_stats)/sizeof(uint64_t)) extern counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE]; #define BBR_OPTS_ADD(name, amm) counter_u64_add(bbr_opts_arry[(offsetof(struct bbr_opts_stats, name)/sizeof(uint64_t))], (amm)) #define
BBR_OPTS_INC(name) BBR_OPTS_ADD(name, 1) #endif #define BBR_NUM_LOSS_RATES 3 #define BBR_NUM_BW_RATES 3 #define BBR_RECOVERY_LOWRTT 1 #define BBR_RECOVERY_MEDRTT 2 #define BBR_RECOVERY_HIGHRTT 3 #define BBR_RECOVERY_EXTREMERTT 4 struct bbr_control { /*******************************/ /* Cache line 2 from bbr start */ /*******************************/ struct bbr_head rc_map; /* List of all segments Lock(a) */ struct bbr_head rc_tmap; /* List in transmit order Lock(a) */ struct bbr_sendmap *rc_resend; /* something we have been asked to * resend */ uint32_t rc_last_delay_val; /* How much we expect to delay Lock(a) */ uint32_t rc_bbr_hptsi_gain:16, /* Current hptsi gain Lock(a) */ rc_hpts_flags:16; /* flags on what's on the pacer wheel */ uint32_t rc_delivered; /* BBR delivered amount Lock(a) */ uint32_t rc_hptsi_agg_delay; /* How much time are we behind */ uint32_t rc_flight_at_input; uint32_t rc_lost_bytes; /* Total bytes currently marked lost */ /*******************************/ /* Cache line 3 from bbr start */ /*******************************/ struct time_filter rc_delrate; /*******************************/ /* Cache line 4 from bbr start */ /*******************************/ struct bbr_head rc_free; /* List of Free map entries Lock(a) */ struct bbr_sendmap *rc_tlp_send; /* something we have been * asked to resend */ uint32_t rc_del_time; uint32_t rc_target_at_state; /* Target for a state */ uint16_t rc_free_cnt; /* Number of free entries on the rc_free list * Lock(a) */ uint16_t rc_startup_pg; uint32_t cur_rtt; /* Last RTT from ack */ uint32_t rc_went_idle_time; /* Used for persists to see if it's * probe-rtt qualified */ uint32_t rc_pace_max_segs:17, /* How much in any single TSO we send Lock(a) */ rc_pace_min_segs:15; /* The minimum single segment size before we enter persists */ uint32_t rc_rtt_shrinks; /* Time of last rtt shrinkage Lock(a) */ uint32_t r_app_limited_until; uint32_t rc_timer_exp; /* If a timer ticks, the time of expiry */ uint32_t rc_rcv_epoch_start; /* Start time of the Epoch Lock(a) */ /*******************************/ /* Cache line 5 from bbr start */ /*******************************/ uint32_t rc_lost_at_pktepoch; /* what the lost value was at the last * pkt-epoch */ uint32_t r_measurement_count; /* count of measurement applied lock(a) */ uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ struct bbr_sendmap *rc_sacklast; /* sack remembered place * Lock(a) */ struct bbr_sendmap *rc_next; /* remembered place where we next * retransmit at Lock(a) */ uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */ uint32_t rc_holes_rxt; /* Tot retransmitted from scoreboard Lock(a) */ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */ /*- --- - * used only inital and close + * used only initial and close */ uint32_t rc_high_rwnd; /* Highest rwnd seen */ uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */ uint32_t rc_last_rtt; /* Last valid measured RTT that ack'd data */ uint32_t bbr_cross_over; /*******************************/ /* Cache line 6 from bbr start */ /*******************************/ struct sack_filter bbr_sf; /*******************************/ /* Cache line 7 from bbr start */ /*******************************/ struct time_filter_small rc_rttprop; uint32_t last_inbound_ts; /* Peer's last timestamp */ uint32_t rc_inc_tcp_oh: 1, rc_inc_ip_oh: 1, rc_inc_enet_oh:1, rc_incr_tmrs:1,
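/*
 * Editor's illustrative note, not part of the original header: the
 * offsetof() arithmetic in BBR_STAT_ADD above maps a field name to its
 * slot in the parallel counter array, so
 *
 *	BBR_STAT_INC(bbr_tlp_tot);
 *
 * expands, in effect, to
 *
 *	counter_u64_add(bbr_stat_arry[5], 1);
 *
 * because bbr_tlp_tot sits at offset 5 * sizeof(uint64_t) in struct
 * bbr_stats.
 */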
restrict_growth:28; uint32_t rc_lt_epoch_use; /* When we started lt-bw use Lock(a) */ uint32_t rc_recovery_start; /* Time we start recovery Lock(a) */ uint32_t rc_lt_del; /* Delivered at lt bw sampling start Lock(a) */ uint64_t rc_bbr_cur_del_rate; /* Current measured delivery rate * Lock(a) */ /*******************************/ /* Cache line 8 from bbr start */ /*******************************/ uint32_t rc_cwnd_on_ent; /* On entry to recovery the cwnd * Lock(a) */ uint32_t rc_agg_early; /* aggregate amount early */ uint32_t rc_rcvtime; /* When we last received data Lock(a) */ uint32_t rc_pkt_epoch_del; /* seq num that we need for RTT epoch */ uint32_t rc_pkt_epoch; /* Epoch based on packet RTTs */ uint32_t rc_pkt_epoch_time; /* Time we started the pkt epoch */ uint32_t rc_pkt_epoch_rtt; /* RTT using the packet epoch */ uint32_t rc_rtt_epoch; /* Current RTT epoch, it ticks every rttProp * Lock(a) */ uint32_t lowest_rtt; uint32_t bbr_smallest_srtt_this_state; uint32_t rc_lt_epoch; /* LT epoch start of bw_sampling */ uint32_t rc_lost_at_startup; uint32_t rc_bbr_state_atflight; uint32_t rc_bbr_last_startup_epoch; /* Last startup epoch where we * increased 20% */ uint32_t rc_bbr_enters_probertt; /* Timestamp we entered * probertt Lock(a) */ uint32_t rc_lt_time; /* Time of lt sampling start Lock(a) */ /*******************************/ /* Cache line 9 from bbr start */ /*******************************/ uint64_t rc_lt_bw; /* LT bw calculated Lock(a) */ uint64_t rc_bbr_lastbtlbw; /* For startup, what was last btlbw I * saw to check the 20% gain Lock(a) */ uint32_t rc_bbr_cwnd_gain; /* Current cwnd gain Lock(a) */ uint32_t rc_pkt_epoch_loss_rate; /* pkt-epoch loss rate */ uint32_t rc_saved_cwnd; /* Saved cwnd during Probe-rtt drain Lock(a) */ uint32_t substate_pe; uint32_t rc_lost; /* Number of bytes lost Lock(a) */ uint32_t rc_exta_time_gd; /* How much extra time we got in d/g */ uint32_t rc_lt_lost; /* Number of lt bytes lost at sampling start * Lock(a) */ uint32_t rc_bbr_state_time; uint32_t rc_min_to; /* Socket option value Lock(a) */ uint32_t rc_initial_hptsi_bw; /* Our initial startup bw Lock(a) */ uint32_t bbr_lost_at_state; /* Temp counter debug lost value as we * enter a state */ /*******************************/ /* Cache line 10 from bbr start */ /*******************************/ uint32_t rc_level_state_extra; uint32_t rc_red_cwnd_pe; const struct tcp_hwrate_limit_table *crte; uint64_t red_bw; uint32_t rc_probertt_int; uint32_t rc_probertt_srttchktim; /* Time we last did an srtt * check */ uint32_t gain_epoch; /* Epoch we should be out of gain */ uint32_t rc_min_rto_ms; uint32_t rc_reorder_fade; /* Socket option value Lock(a) */ uint32_t last_startup_measure; int32_t bbr_hptsi_per_second; int32_t bbr_hptsi_segments_delay_tar; int32_t bbr_hptsi_segments_max; uint32_t bbr_rttprobe_gain_val; /*******************************/ /* Cache line 11 from bbr start */ /*******************************/ uint32_t cur_rtt_send_time; /* Time we sent our rtt measured packet */ uint32_t bbr_peer_tsratio; /* Our calculated ts ratio to multiply */ uint32_t bbr_ts_check_tstmp; /* When we filled it the TS that came on the ack */ uint32_t bbr_ts_check_our_cts; /* When we filled it the cts of the send */ uint32_t rc_tlp_rxt_last_time; uint32_t bbr_smallest_srtt_state2; uint32_t bbr_hdwr_cnt_noset_snt; /* count of hw pacing sends during delay */ uint32_t startup_last_srtt; uint32_t rc_ack_hdwr_delay; uint32_t highest_hdwr_delay; /* Largest delay we have seen from hardware */ uint32_t non_gain_extra; uint32_t
recovery_lr; /* The sum of the loss rate from the PEs during recovery */ uint32_t last_in_probertt; uint32_t flightsize_at_drain; /* In draining what was the last marked flight size */ uint32_t rc_pe_of_prtt; /* PE we went into probe-rtt */ uint32_t ts_in; /* ts that went with the last rtt */ uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent * rc_last_tlp_seq Lock(a) */ uint16_t rc_drain_pg; uint32_t rc_num_maps_alloced; /* num send map entries allocated */ uint32_t rc_num_split_allocs; /* num split map entries allocated */ uint16_t rc_num_small_maps_alloced; /* Number of sack blocks * allocated */ uint16_t bbr_hptsi_bytes_min; uint16_t bbr_hptsi_segments_floor; uint16_t bbr_utter_max; uint16_t bbr_google_discount; }; struct socket; struct tcp_bbr { /* First cache line 0x00 */ int32_t(*r_substate) (struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, struct tcpopt *, int32_t, int32_t, uint32_t, int32_t, int32_t); /* Lock(a) */ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ struct inpcb *rc_inp; /* The inpcb Lock(a) */ struct timeval rc_tv; uint32_t rc_pacer_started; /* Time we started the pacer */ uint16_t no_pacing_until:8, /* No pacing until N packet epochs */ ts_can_raise:1,/* TS b/w calculations can raise the bw higher */ skip_gain:1, /* Skip the gain cycle (hardware pacing) */ gain_is_limited:1, /* With hardware pacing we are limiting gain */ output_error_seen:1, oerror_cnt:4, hw_pacing_set:1; /* long enough has passed for us to start pacing */ uint16_t xxx_r_ack_count; /* During recovery count of acks received * that added data since output */ uint16_t bbr_segs_rcvd; /* In Segment count since we sent an ack */ uint8_t bbr_timer_src:4, /* Used for debugging Lock(a) */ bbr_use_rack_cheat:1, /* Use the rack cheat */ bbr_init_win_cheat:1, /* Send full IW for TSO */ bbr_attempt_hdwr_pace:1,/* Try to do hardware pacing */ bbr_hdrw_pacing:1; /* Hardware pacing is available */ uint8_t bbr_hdw_pace_ena:1, /* Does the connection allow hardware pacing to be attempted */ bbr_prev_in_rec:1, /* We were previously in recovery */ pkt_conservation:1, use_policer_detection:1, xxx_bbr_hdw_pace_idx:4; /* If hardware pacing is on, index to slot in pace tbl */ uint16_t r_wanted_output:1, rtt_valid:1, rc_timer_first:1, rc_output_starts_timer:1, rc_resends_use_tso:1, rc_all_timers_stopped:1, rc_loss_exit:1, rc_ack_was_delayed:1, rc_lt_is_sampling:1, rc_filled_pipe:1, rc_tlp_new_data:1, rc_hit_state_1:1, rc_ts_valid:1, rc_prtt_set_ts:1, rc_is_pkt_epoch_now:1, rc_has_collapsed:1; uint8_t r_state:4, /* Current bbr state Lock(a) */ r_agg_early_set:1, /* Did we get called early */ r_init_rtt:1, r_use_policer:1, /* For google mode only */ r_recovery_bw:1; uint8_t r_timer_override:1, /* pacer override Lock(a) 0/1 */ rc_in_persist:1, rc_lt_use_bw:1, rc_allow_data_af_clo:1, rc_tlp_rtx_out:1, /* A TLP is in flight */ rc_tlp_in_progress:1, /* a TLP timer is running needed?
*/ rc_use_idle_restart:1; /* Do we restart fast after idle (persist or applim) */ uint8_t rc_bbr_state:3, /* What is the major BBR state */ rc_bbr_substate:3, /* For probeBW state */ r_is_v6:1, rc_past_init_win:1; uint8_t rc_last_options; uint8_t rc_tlp_threshold; /* Socket option value Lock(a) */ uint8_t rc_max_rto_sec; uint8_t rc_cwnd_limited:1, /* We are cwnd limited */ rc_tmr_stopped:7; /* What timers have been stopped */ uint8_t rc_use_google:1, rc_use_ts_limit:1, rc_ts_data_set:1, /* We have filled a set point to determine */ rc_ts_clock_set:1, /* We have determined the ts type */ rc_ts_cant_be_used:1, /* We determined we can't use ts values */ rc_ack_is_cumack:1, rc_no_pacing:1, alloc_limit_reported:1; uint8_t rc_init_win; /* Cache line 2 0x40 */ struct bbr_control r_ctl; #ifdef _KERNEL } __aligned(CACHE_LINE_SIZE); #else }; #endif #endif Index: head/sys/opencrypto/xform_enc.h =================================================================== --- head/sys/opencrypto/xform_enc.h (revision 357663) +++ head/sys/opencrypto/xform_enc.h (revision 357664) @@ -1,102 +1,102 @@ /* $FreeBSD$ */ /* $OpenBSD: xform.h,v 1.8 2001/08/28 12:20:43 ben Exp $ */ /*- * The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu) * * This code was written by Angelos D. Keromytis in Athens, Greece, in * February 2000. Network Security Technologies Inc. (NSTI) kindly * supported the development of this code. * * Copyright (c) 2000 Angelos D. Keromytis * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by John-Mark Gurney * under sponsorship of the FreeBSD Foundation and * Rubicon Communications, LLC (Netgate). * * Permission to use, copy, and modify this software without fee * is hereby granted, provided that this entire notice is included in * all source code copies of any software which is or includes a copy or * modification of this software. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #ifndef _CRYPTO_XFORM_ENC_H_ #define _CRYPTO_XFORM_ENC_H_ #include #include #include #include #include #include #include #include #include #include #define AESICM_BLOCKSIZE AES_BLOCK_LEN #define AES_XTS_BLOCKSIZE 16 #define AES_XTS_IVSIZE 8 #define AES_XTS_ALPHA 0x87 /* GF(2^128) generator polynomial */ /* Declarations */ struct enc_xform { int type; char *name; u_int16_t blocksize; /* Required input block size -- 1 for stream ciphers. */ u_int16_t ivsize; u_int16_t minkey, maxkey; void (*encrypt) (caddr_t, u_int8_t *); void (*decrypt) (caddr_t, u_int8_t *); int (*setkey) (u_int8_t **, const u_int8_t *, int len); void (*zerokey) (u_int8_t **); void (*reinit) (caddr_t, const u_int8_t *); /* * Encrypt/decrypt 1+ blocks of input -- total size is 'len' bytes. * Len is guaranteed to be a multiple of the defined 'blocksize'. * Optional interface -- most useful for stream ciphers with a small * blocksize (1). 
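 *
 * (Editor's illustrative aside, not in the original header: a consumer
 * that finds encrypt_multi non-NULL can hand it a whole buffer at
 * once,
 *
 *	if (exf->encrypt_multi != NULL)
 *		exf->encrypt_multi(ctx, buf, len);
 *	else
 *		for (i = 0; i < len; i += exf->blocksize)
 *			exf->encrypt(ctx, buf + i);
 *
 * assuming, as stated above, that len is a multiple of blocksize.)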
*/ void (*encrypt_multi) (void *, uint8_t *, size_t len); void (*decrypt_multi) (void *, uint8_t *, size_t len); }; extern struct enc_xform enc_xform_null; extern struct enc_xform enc_xform_des; extern struct enc_xform enc_xform_3des; extern struct enc_xform enc_xform_blf; extern struct enc_xform enc_xform_cast5; extern struct enc_xform enc_xform_skipjack; extern struct enc_xform enc_xform_rijndael128; extern struct enc_xform enc_xform_aes_icm; extern struct enc_xform enc_xform_aes_nist_gcm; extern struct enc_xform enc_xform_aes_nist_gmac; extern struct enc_xform enc_xform_aes_xts; extern struct enc_xform enc_xform_arc4; extern struct enc_xform enc_xform_camellia; extern struct enc_xform enc_xform_chacha20; extern struct enc_xform enc_xform_ccm; struct aes_icm_ctx { u_int32_t ac_ek[4*(RIJNDAEL_MAXNR + 1)]; - /* ac_block is initalized to IV */ + /* ac_block is initialized to IV */ u_int8_t ac_block[AESICM_BLOCKSIZE]; int ac_nr; }; struct aes_xts_ctx { rijndael_ctx key1; rijndael_ctx key2; u_int8_t tweak[AES_XTS_BLOCKSIZE]; }; #endif /* _CRYPTO_XFORM_ENC_H_ */ Index: head/usr.bin/truss/syscalls.c =================================================================== --- head/usr.bin/truss/syscalls.c (revision 357663) +++ head/usr.bin/truss/syscalls.c (revision 357664) @@ -1,2801 +1,2801 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright 1997 Sean Eric Fagan * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan * 4. Neither the name of the author may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * This file has routines used to print out system calls and their * arguments. 
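 *
 * (Editor's aside, not in the original source: each entry in the
 * decoded_syscalls table below names a syscall, its return type,
 * argument count, and a decoder plus optional IN/OUT direction for
 * each argument, e.g.
 *
 *	{ .name = "read", .ret_type = 1, .nargs = 3,
 *	  .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 } } },
 *
 * where the number in each pair is the argument slot that quad_fixup()
 * may later adjust on 32-bit ABIs.)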
*/ #include #include #define _WANT_FREEBSD11_KEVENT #include #include #include #include #include #include #include #define _WANT_FREEBSD11_STAT #include #include #include #include #include #include #include #include #include #include #include #define _WANT_KERNEL_ERRNO #include #include #include #include #include #include #include #include #include #include #include #include #include #include "truss.h" #include "extern.h" #include "syscall.h" /* * This should probably be in its own file, sorted alphabetically. */ static struct syscall decoded_syscalls[] = { /* Native ABI */ { .name = "__acl_aclcheck_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_aclcheck_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_aclcheck_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_delete_fd", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Acltype, 1 } } }, { .name = "__acl_delete_file", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Acltype, 1 } } }, { .name = "__acl_delete_link", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Acltype, 1 } } }, { .name = "__acl_get_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_get_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_get_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__cap_rights_get", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { CapRights | OUT, 2 } } }, { .name = "__getcwd", .ret_type = 1, .nargs = 2, .args = { { Name | OUT, 0 }, { Int, 1 } } }, { .name = "_umtx_op", .ret_type = 1, .nargs = 5, .args = { { Ptr, 0 }, { Umtxop, 1 }, { LongHex, 2 }, { Ptr, 3 }, { Ptr, 4 } } }, { .name = "accept", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "access", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Accessmode, 1 } } }, { .name = "bind", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | IN, 1 }, { Socklent, 2 } } }, { .name = "bindat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Int, 1 }, { Sockaddr | IN, 2 }, { Int, 3 } } }, { .name = "break", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "cap_fcntls_get", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapFcntlRights | OUT, 1 } } }, { .name = "cap_fcntls_limit", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapFcntlRights, 1 } } }, { .name = "cap_getmode", .ret_type = 1, .nargs = 1, .args = { { PUInt | OUT, 0 } } }, { .name = "cap_rights_limit", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapRights, 1 } } }, { .name = "chdir", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "chflags", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { FileFlags, 1 } } }, { .name = "chflagsat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { FileFlags, 2 }, { Atflags, 3 } } }, { .name = "chmod", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "chown", .ret_type = 
1, .nargs = 3, .args = { { Name, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "chroot", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "clock_gettime", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec | OUT, 1 } } }, { .name = "close", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "compat11.fstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Stat11 | OUT, 1 } } }, { .name = "compat11.fstatat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Stat11 | OUT, 2 }, { Atflags, 3 } } }, { .name = "compat11.kevent", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { Kevent11, 1 }, { Int, 2 }, { Kevent11 | OUT, 3 }, { Int, 4 }, { Timespec, 5 } } }, { .name = "compat11.lstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat11 | OUT, 1 } } }, { .name = "compat11.stat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat11 | OUT, 1 } } }, { .name = "connect", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | IN, 1 }, { Socklent, 2 } } }, { .name = "connectat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Int, 1 }, { Sockaddr | IN, 2 }, { Int, 3 } } }, { .name = "dup", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "dup2", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "eaccess", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Accessmode, 1 } } }, { .name = "execve", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { ExecArgs | IN, 1 }, { ExecEnv | IN, 2 } } }, { .name = "exit", .ret_type = 0, .nargs = 1, .args = { { Hex, 0 } } }, { .name = "extattr_delete_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_delete_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_delete_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_get_fd", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_get_file", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_get_link", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_list_fd", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_list_file", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_list_link", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_set_fd", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattr_set_file", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattr_set_link", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattrctl", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Hex, 1 }, { Name, 2 }, { Extattrnamespace, 3 }, { Name, 4 } } }, { .name = "faccessat", .ret_type = 
1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Accessmode, 2 }, { Atflags, 3 } } }, { .name = "fchflags", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { FileFlags, 1 } } }, { .name = "fchmod", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Octal, 1 } } }, { .name = "fchmodat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 }, { Atflags, 3 } } }, { .name = "fchown", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "fchownat", .ret_type = 1, .nargs = 5, .args = { { Atfd, 0 }, { Name, 1 }, { Int, 2 }, { Int, 3 }, { Atflags, 4 } } }, { .name = "fcntl", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Fcntl, 1 }, { Fcntlflag, 2 } } }, { .name = "fdatasync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "flock", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Flockop, 1 } } }, { .name = "fstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Stat | OUT, 1 } } }, { .name = "fstatat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Stat | OUT, 2 }, { Atflags, 3 } } }, { .name = "fstatfs", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { StatFs | OUT, 1 } } }, { .name = "fsync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "ftruncate", .ret_type = 1, .nargs = 2, .args = { { Int | IN, 0 }, { QuadHex | IN, 1 } } }, { .name = "futimens", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec2 | IN, 1 } } }, { .name = "futimes", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timeval2 | IN, 1 } } }, { .name = "futimesat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Timeval2 | IN, 2 } } }, { .name = "getdirentries", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Int, 2 }, { PQuadHex | OUT, 3 } } }, { .name = "getfsstat", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Long, 1 }, { Getfsstatmode, 2 } } }, { .name = "getitimer", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Itimerval | OUT, 2 } } }, { .name = "getpeername", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "getpgid", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "getpriority", .ret_type = 1, .nargs = 2, .args = { { Priowhich, 0 }, { Int, 1 } } }, { .name = "getrandom", .ret_type = 1, .nargs = 3, .args = { { BinString | OUT, 0 }, { Sizet, 1 }, { UInt, 2 } } }, { .name = "getrlimit", .ret_type = 1, .nargs = 2, .args = { { Resource, 0 }, { Rlimit | OUT, 1 } } }, { .name = "getrusage", .ret_type = 1, .nargs = 2, .args = { { RusageWho, 0 }, { Rusage | OUT, 1 } } }, { .name = "getsid", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "getsockname", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "getsockopt", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 }, { Ptr | OUT, 3 }, { Ptr | OUT, 4 } } }, { .name = "gettimeofday", .ret_type = 1, .nargs = 2, .args = { { Timeval | OUT, 0 }, { Ptr, 1 } } }, { .name = "ioctl", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Ioctl, 1 }, { Ptr, 2 } } }, { .name = "kevent", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { Kevent, 1 }, { Int, 2 }, { Kevent | OUT, 3 }, { Int, 4 }, { Timespec, 5 } } }, { .name = "kill", .ret_type = 1, .nargs = 2, .args = { { Int | IN, 0 }, { Signal | IN, 1 } } }, { .name = "kldfind", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "kldfirstmod", .ret_type = 
1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldload", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "kldnext", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Ptr, 1 } } }, { .name = "kldsym", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Kldsymcmd, 1 }, { Ptr, 2 } } }, { .name = "kldunload", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldunloadf", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Kldunloadflags, 1 } } }, { .name = "kse_release", .ret_type = 0, .nargs = 1, .args = { { Timespec, 0 } } }, { .name = "lchflags", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { FileFlags, 1 } } }, { .name = "lchmod", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "lchown", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "link", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "linkat", .ret_type = 1, .nargs = 5, .args = { { Atfd, 0 }, { Name, 1 }, { Atfd, 2 }, { Name, 3 }, { Atflags, 4 } } }, { .name = "listen", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "lseek", .ret_type = 2, .nargs = 3, .args = { { Int, 0 }, { QuadHex, 1 }, { Whence, 2 } } }, { .name = "lstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat | OUT, 1 } } }, { .name = "lutimes", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Timeval2 | IN, 1 } } }, { .name = "madvise", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Madvice, 2 } } }, { .name = "minherit", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Minherit, 2 } } }, { .name = "mkdir", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "mkdirat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 } } }, { .name = "mkfifo", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "mkfifoat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 } } }, { .name = "mknod", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Octal, 1 }, { Int, 2 } } }, { .name = "mknodat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 }, { Int, 3 } } }, { .name = "mlock", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "mlockall", .ret_type = 1, .nargs = 1, .args = { { Mlockall, 0 } } }, { .name = "mmap", .ret_type = 1, .nargs = 6, .args = { { Ptr, 0 }, { Sizet, 1 }, { Mprot, 2 }, { Mmapflags, 3 }, { Int, 4 }, { QuadHex, 5 } } }, { .name = "modfind", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "mount", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Name, 1 }, { Mountflags, 2 }, { Ptr, 3 } } }, { .name = "mprotect", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Mprot, 2 } } }, { .name = "msync", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Msync, 2 } } }, { .name = "munlock", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "munmap", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "nanosleep", .ret_type = 1, .nargs = 1, .args = { { Timespec, 0 } } }, { .name = "nmount", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { UInt, 1 }, { Mountflags, 2 } } }, { .name = "open", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { Open, 1 }, { Octal, 2 } } }, { .name = "openat", .ret_type = 1, .nargs = 4, 
.args = { { Atfd, 0 }, { Name | IN, 1 }, { Open, 2 }, { Octal, 3 } } }, { .name = "pathconf", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Pathconf, 1 } } }, { .name = "pipe", .ret_type = 1, .nargs = 1, .args = { { PipeFds | OUT, 0 } } }, { .name = "pipe2", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Pipe2, 1 } } }, { .name = "poll", .ret_type = 1, .nargs = 3, .args = { { Pollfd, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "posix_fadvise", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { QuadHex, 1 }, { QuadHex, 2 }, { Fadvice, 3 } } }, { .name = "posix_openpt", .ret_type = 1, .nargs = 1, .args = { { Open, 0 } } }, { .name = "pread", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 }, { QuadHex, 3 } } }, { .name = "procctl", .ret_type = 1, .nargs = 4, .args = { { Idtype, 0 }, { Quad, 1 }, { Procctl, 2 }, { Ptr, 3 } } }, { .name = "ptrace", .ret_type = 1, .nargs = 4, .args = { { Ptraceop, 0 }, { Int, 1 }, { Ptr, 2 }, { Int, 3 } } }, { .name = "pwrite", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 }, { QuadHex, 3 } } }, { .name = "quotactl", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Quotactlcmd, 1 }, { Int, 2 }, { Ptr, 3 } } }, { .name = "read", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 } } }, { .name = "readlink", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Readlinkres | OUT, 1 }, { Sizet, 2 } } }, { .name = "readlinkat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Readlinkres | OUT, 2 }, { Sizet, 3 } } }, { .name = "readv", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Iovec | OUT, 1 }, { Int, 2 } } }, { .name = "reboot", .ret_type = 1, .nargs = 1, .args = { { Reboothowto, 0 } } }, { .name = "recvfrom", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 }, { Msgflags, 3 }, { Sockaddr | OUT, 4 }, { Ptr | OUT, 5 } } }, { .name = "recvmsg", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Msghdr | OUT, 1 }, { Msgflags, 2 } } }, { .name = "rename", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "renameat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Atfd, 2 }, { Name, 3 } } }, { .name = "rfork", .ret_type = 1, .nargs = 1, .args = { { Rforkflags, 0 } } }, { .name = "rmdir", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "rtprio", .ret_type = 1, .nargs = 3, .args = { { Rtpriofunc, 0 }, { Int, 1 }, { Ptr, 2 } } }, { .name = "rtprio_thread", .ret_type = 1, .nargs = 3, .args = { { Rtpriofunc, 0 }, { Int, 1 }, { Ptr, 2 } } }, { .name = "sched_get_priority_max", .ret_type = 1, .nargs = 1, .args = { { Schedpolicy, 0 } } }, { .name = "sched_get_priority_min", .ret_type = 1, .nargs = 1, .args = { { Schedpolicy, 0 } } }, { .name = "sched_getparam", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Schedparam | OUT, 1 } } }, { .name = "sched_getscheduler", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "sched_rr_get_interval", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec | OUT, 1 } } }, { .name = "sched_setparam", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Schedparam, 1 } } }, { .name = "sched_setscheduler", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Schedpolicy, 1 }, { Schedparam, 2 } } }, { .name = "sctp_generic_recvmsg", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { Iovec | OUT, 1 }, { Int, 2 }, { Sockaddr | OUT, 3 }, { Ptr | OUT, 4 }, { Sctpsndrcvinfo | OUT, 5 }, { Ptr | 
OUT, 6 } } }, { .name = "sctp_generic_sendmsg", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 2 }, { Sockaddr | IN, 3 }, { Socklent, 4 }, { Sctpsndrcvinfo | IN, 5 }, { Msgflags, 6 } } }, { .name = "sctp_generic_sendmsg_iov", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { Iovec | IN, 1 }, { Int, 2 }, { Sockaddr | IN, 3 }, { Socklent, 4 }, { Sctpsndrcvinfo | IN, 5 }, { Msgflags, 6 } } }, { .name = "select", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Fd_set, 1 }, { Fd_set, 2 }, { Fd_set, 3 }, { Timeval, 4 } } }, { .name = "sendmsg", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Msghdr | IN, 1 }, { Msgflags, 2 } } }, { .name = "sendto", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 }, { Msgflags, 3 }, { Sockaddr | IN, 4 }, { Socklent | IN, 5 } } }, { .name = "setitimer", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Itimerval, 1 }, { Itimerval | OUT, 2 } } }, { .name = "setpriority", .ret_type = 1, .nargs = 3, .args = { { Priowhich, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "setrlimit", .ret_type = 1, .nargs = 2, .args = { { Resource, 0 }, { Rlimit | IN, 1 } } }, { .name = "setsockopt", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 }, { Ptr | IN, 3 }, { Socklent, 4 } } }, { .name = "shm_open", .ret_type = 1, .nargs = 3, .args = { { ShmName | IN, 0 }, { Open, 1 }, { Octal, 2 } } }, { .name = "shm_rename", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { Name | IN, 1 }, { Hex, 2 } } }, { .name = "shm_unlink", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "shutdown", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Shutdown, 1 } } }, { .name = "sigaction", .ret_type = 1, .nargs = 3, .args = { { Signal, 0 }, { Sigaction | IN, 1 }, { Sigaction | OUT, 2 } } }, { .name = "sigpending", .ret_type = 1, .nargs = 1, .args = { { Sigset | OUT, 0 } } }, { .name = "sigprocmask", .ret_type = 1, .nargs = 3, .args = { { Sigprocmask, 0 }, { Sigset, 1 }, { Sigset | OUT, 2 } } }, { .name = "sigqueue", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Signal, 1 }, { LongHex, 2 } } }, { .name = "sigreturn", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "sigsuspend", .ret_type = 1, .nargs = 1, .args = { { Sigset | IN, 0 } } }, { .name = "sigtimedwait", .ret_type = 1, .nargs = 3, .args = { { Sigset | IN, 0 }, { Siginfo | OUT, 1 }, { Timespec | IN, 2 } } }, { .name = "sigwait", .ret_type = 1, .nargs = 2, .args = { { Sigset | IN, 0 }, { PSig | OUT, 1 } } }, { .name = "sigwaitinfo", .ret_type = 1, .nargs = 2, .args = { { Sigset | IN, 0 }, { Siginfo | OUT, 1 } } }, { .name = "socket", .ret_type = 1, .nargs = 3, .args = { { Sockdomain, 0 }, { Socktype, 1 }, { Sockprotocol, 2 } } }, { .name = "stat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat | OUT, 1 } } }, { .name = "statfs", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { StatFs | OUT, 1 } } }, { .name = "symlink", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "symlinkat", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Atfd, 1 }, { Name, 2 } } }, { .name = "sysarch", .ret_type = 1, .nargs = 2, .args = { { Sysarch, 0 }, { Ptr, 1 } } }, { .name = "__sysctl", .ret_type = 1, .nargs = 6, .args = { { Sysctl, 0 }, { Sizet, 1 }, { Ptr, 2 }, { Ptr, 3 }, { Ptr, 4 }, { Sizet, 5 } } }, { .name = "__sysctlbyname", .ret_type = 1, .nargs = 6, .args = { { Name, 0 }, { Sizet, 1 }, { Ptr, 2 }, { Ptr, 3 }, { Ptr, 4}, { Sizet, 5 } } }, { .name = 
"thr_kill", .ret_type = 1, .nargs = 2, .args = { { Long, 0 }, { Signal, 1 } } }, { .name = "thr_self", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "thr_set_name", .ret_type = 1, .nargs = 2, .args = { { Long, 0 }, { Name, 1 } } }, { .name = "truncate", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { QuadHex | IN, 1 } } }, #if 0 /* Does not exist */ { .name = "umount", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Int, 2 } } }, #endif { .name = "unlink", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "unlinkat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Atflags, 2 } } }, { .name = "unmount", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Mountflags, 1 } } }, { .name = "utimensat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Timespec2 | IN, 2 }, { Atflags, 3 } } }, { .name = "utimes", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Timeval2 | IN, 1 } } }, { .name = "utrace", .ret_type = 1, .nargs = 1, .args = { { Utrace, 0 } } }, { .name = "wait4", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { ExitStatus | OUT, 1 }, { Waitoptions, 2 }, { Rusage | OUT, 3 } } }, { .name = "wait6", .ret_type = 1, .nargs = 6, .args = { { Idtype, 0 }, { Quad, 1 }, { ExitStatus | OUT, 2 }, { Waitoptions, 3 }, { Rusage | OUT, 4 }, { Siginfo | OUT, 5 } } }, { .name = "write", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 } } }, { .name = "writev", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Iovec | IN, 1 }, { Int, 2 } } }, /* Linux ABI */ { .name = "linux_access", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Accessmode, 1 } } }, { .name = "linux_execve", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { ExecArgs | IN, 1 }, { ExecEnv | IN, 2 } } }, { .name = "linux_lseek", .ret_type = 2, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Whence, 2 } } }, { .name = "linux_mkdir", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Int, 1 } } }, { .name = "linux_newfstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Ptr | OUT, 1 } } }, { .name = "linux_newstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Ptr | OUT, 1 } } }, { .name = "linux_open", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Hex, 1 }, { Octal, 2 } } }, { .name = "linux_readlink", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Name | OUT, 1 }, { Sizet, 2 } } }, { .name = "linux_socketcall", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { LinuxSockArgs, 1 } } }, { .name = "linux_stat64", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Ptr | OUT, 1 } } }, /* CloudABI system calls. 
*/ { .name = "cloudabi_sys_clock_res_get", .ret_type = 1, .nargs = 1, .args = { { CloudABIClockID, 0 } } }, { .name = "cloudabi_sys_clock_time_get", .ret_type = 1, .nargs = 2, .args = { { CloudABIClockID, 0 }, { CloudABITimestamp, 1 } } }, { .name = "cloudabi_sys_condvar_signal", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { CloudABIMFlags, 1 }, { UInt, 2 } } }, { .name = "cloudabi_sys_fd_close", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_create1", .ret_type = 1, .nargs = 1, .args = { { CloudABIFileType, 0 } } }, { .name = "cloudabi_sys_fd_create2", .ret_type = 1, .nargs = 2, .args = { { CloudABIFileType, 0 }, { PipeFds | OUT, 0 } } }, { .name = "cloudabi_sys_fd_datasync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_dup", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_replace", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_fd_seek", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { CloudABIWhence, 2 } } }, { .name = "cloudabi_sys_fd_stat_get", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABIFDStat | OUT, 1 } } }, { .name = "cloudabi_sys_fd_stat_put", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { CloudABIFDStat | IN, 1 }, { CloudABIFDSFlags, 2 } } }, { .name = "cloudabi_sys_fd_sync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_file_advise", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 }, { CloudABIAdvice, 3 } } }, { .name = "cloudabi_sys_file_allocate", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "cloudabi_sys_file_create", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIFileType, 3 } } }, { .name = "cloudabi_sys_file_link", .ret_type = 1, .nargs = 4, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { Int, 3 }, { BinString | IN, 4 } } }, { .name = "cloudabi_sys_file_open", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIOFlags, 3 }, { CloudABIFDStat | IN, 4 } } }, { .name = "cloudabi_sys_file_readdir", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Int, 2 }, { Int, 3 } } }, { .name = "cloudabi_sys_file_readlink", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { BinString | OUT, 3 }, { Int, 4 } } }, { .name = "cloudabi_sys_file_rename", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 3 }, { BinString | IN, 4 } } }, { .name = "cloudabi_sys_file_stat_fget", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABIFileStat | OUT, 1 } } }, { .name = "cloudabi_sys_file_stat_fput", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { CloudABIFileStat | IN, 1 }, { CloudABIFSFlags, 2 } } }, { .name = "cloudabi_sys_file_stat_get", .ret_type = 1, .nargs = 3, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { CloudABIFileStat | OUT, 3 } } }, { .name = "cloudabi_sys_file_stat_put", .ret_type = 1, .nargs = 4, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { CloudABIFileStat | IN, 3 }, { CloudABIFSFlags, 4 } } }, { .name = "cloudabi_sys_file_symlink", .ret_type = 1, .nargs = 3, .args = { { BinString | IN, 0 }, { Int, 2 }, { BinString | IN, 3 } } }, { .name = "cloudabi_sys_file_unlink", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIULFlags, 3 } } }, { .name = "cloudabi_sys_lock_unlock", .ret_type = 1, .nargs = 2, .args = 
{ { Ptr, 0 }, { CloudABIMFlags, 1 } } }, { .name = "cloudabi_sys_mem_advise", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIAdvice, 2 } } }, { .name = "cloudabi_sys_mem_map", .ret_type = 1, .nargs = 6, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMProt, 2 }, { CloudABIMFlags, 3 }, { Int, 4 }, { Int, 5 } } }, { .name = "cloudabi_sys_mem_protect", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMProt, 2 } } }, { .name = "cloudabi_sys_mem_sync", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMSFlags, 2 } } }, { .name = "cloudabi_sys_mem_unmap", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_proc_exec", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 2 }, { IntArray, 3 }, { Int, 4 } } }, { .name = "cloudabi_sys_proc_exit", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_proc_fork", .ret_type = 1, .nargs = 0 }, { .name = "cloudabi_sys_proc_raise", .ret_type = 1, .nargs = 1, .args = { { CloudABISignal, 0 } } }, { .name = "cloudabi_sys_random_get", .ret_type = 1, .nargs = 2, .args = { { BinString | OUT, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_sock_shutdown", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABISDFlags, 1 } } }, { .name = "cloudabi_sys_thread_exit", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { CloudABIMFlags, 1 } } }, { .name = "cloudabi_sys_thread_yield", .ret_type = 1, .nargs = 0 }, { .name = 0 }, }; static STAILQ_HEAD(, syscall) syscalls; /* Xlat idea taken from strace */ struct xlat { int val; const char *str; }; #define X(a) { a, #a }, #define XEND { 0, NULL } static struct xlat poll_flags[] = { X(POLLSTANDARD) X(POLLIN) X(POLLPRI) X(POLLOUT) X(POLLERR) X(POLLHUP) X(POLLNVAL) X(POLLRDNORM) X(POLLRDBAND) X(POLLWRBAND) X(POLLINIGNEOF) XEND }; static struct xlat sigaction_flags[] = { X(SA_ONSTACK) X(SA_RESTART) X(SA_RESETHAND) X(SA_NOCLDSTOP) X(SA_NODEFER) X(SA_NOCLDWAIT) X(SA_SIGINFO) XEND }; static struct xlat linux_socketcall_ops[] = { X(LINUX_SOCKET) X(LINUX_BIND) X(LINUX_CONNECT) X(LINUX_LISTEN) X(LINUX_ACCEPT) X(LINUX_GETSOCKNAME) X(LINUX_GETPEERNAME) X(LINUX_SOCKETPAIR) X(LINUX_SEND) X(LINUX_RECV) X(LINUX_SENDTO) X(LINUX_RECVFROM) X(LINUX_SHUTDOWN) X(LINUX_SETSOCKOPT) X(LINUX_GETSOCKOPT) X(LINUX_SENDMSG) X(LINUX_RECVMSG) XEND }; #undef X #define X(a) { CLOUDABI_##a, #a }, static struct xlat cloudabi_advice[] = { X(ADVICE_DONTNEED) X(ADVICE_NOREUSE) X(ADVICE_NORMAL) X(ADVICE_RANDOM) X(ADVICE_SEQUENTIAL) X(ADVICE_WILLNEED) XEND }; static struct xlat cloudabi_clockid[] = { X(CLOCK_MONOTONIC) X(CLOCK_PROCESS_CPUTIME_ID) X(CLOCK_REALTIME) X(CLOCK_THREAD_CPUTIME_ID) XEND }; static struct xlat cloudabi_fdflags[] = { X(FDFLAG_APPEND) X(FDFLAG_DSYNC) X(FDFLAG_NONBLOCK) X(FDFLAG_RSYNC) X(FDFLAG_SYNC) XEND }; static struct xlat cloudabi_fdsflags[] = { X(FDSTAT_FLAGS) X(FDSTAT_RIGHTS) XEND }; static struct xlat cloudabi_filetype[] = { X(FILETYPE_UNKNOWN) X(FILETYPE_BLOCK_DEVICE) X(FILETYPE_CHARACTER_DEVICE) X(FILETYPE_DIRECTORY) X(FILETYPE_PROCESS) X(FILETYPE_REGULAR_FILE) X(FILETYPE_SHARED_MEMORY) X(FILETYPE_SOCKET_DGRAM) X(FILETYPE_SOCKET_STREAM) X(FILETYPE_SYMBOLIC_LINK) XEND }; static struct xlat cloudabi_fsflags[] = { X(FILESTAT_ATIM) X(FILESTAT_ATIM_NOW) X(FILESTAT_MTIM) X(FILESTAT_MTIM_NOW) X(FILESTAT_SIZE) XEND }; static struct xlat cloudabi_mflags[] = { X(MAP_ANON) X(MAP_FIXED) X(MAP_PRIVATE) X(MAP_SHARED) XEND }; static struct xlat cloudabi_mprot[] = { X(PROT_EXEC) X(PROT_WRITE) 
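/*
 * Editor's illustrative note, not part of the original source: in
 * these CloudABI tables X() pastes the CLOUDABI_ prefix onto the
 * value and stringifies the name, so the entry below,
 *
 *	X(PROT_READ)
 *
 * expands to
 *
 *	{ CLOUDABI_PROT_READ, "PROT_READ" },
 *
 * and XEND supplies the { 0, NULL } terminator that lookup() walks to.
 */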
X(PROT_READ) XEND }; static struct xlat cloudabi_msflags[] = { X(MS_ASYNC) X(MS_INVALIDATE) X(MS_SYNC) XEND }; static struct xlat cloudabi_oflags[] = { X(O_CREAT) X(O_DIRECTORY) X(O_EXCL) X(O_TRUNC) XEND }; static struct xlat cloudabi_sdflags[] = { X(SHUT_RD) X(SHUT_WR) XEND }; static struct xlat cloudabi_signal[] = { X(SIGABRT) X(SIGALRM) X(SIGBUS) X(SIGCHLD) X(SIGCONT) X(SIGFPE) X(SIGHUP) X(SIGILL) X(SIGINT) X(SIGKILL) X(SIGPIPE) X(SIGQUIT) X(SIGSEGV) X(SIGSTOP) X(SIGSYS) X(SIGTERM) X(SIGTRAP) X(SIGTSTP) X(SIGTTIN) X(SIGTTOU) X(SIGURG) X(SIGUSR1) X(SIGUSR2) X(SIGVTALRM) X(SIGXCPU) X(SIGXFSZ) XEND }; static struct xlat cloudabi_ulflags[] = { X(UNLINK_REMOVEDIR) XEND }; static struct xlat cloudabi_whence[] = { X(WHENCE_CUR) X(WHENCE_END) X(WHENCE_SET) XEND }; #undef X #undef XEND /* * Searches an xlat array for a value and returns the matching string if * found. Otherwise returns a string representation of the value in the * requested base. */ static const char * lookup(struct xlat *xlat, int val, int base) { static char tmp[16]; for (; xlat->str != NULL; xlat++) if (xlat->val == val) return (xlat->str); switch (base) { case 8: sprintf(tmp, "0%o", val); break; case 16: sprintf(tmp, "0x%x", val); break; case 10: sprintf(tmp, "%u", val); break; default: errx(1, "Unknown lookup base"); break; } return (tmp); } static const char * xlookup(struct xlat *xlat, int val) { return (lookup(xlat, val, 16)); } /* * Searches an xlat array containing bitfield values. Remaining bits * set after removing the known ones are printed at the end, e.g.: * IN|0x400. */ static char * xlookup_bits(struct xlat *xlat, int val) { int len, rem; static char str[512]; len = 0; rem = val; for (; xlat->str != NULL; xlat++) { if ((xlat->val & rem) == xlat->val) { /* * Don't print the "all-bits-zero" string unless all * bits are really zero. */ if (xlat->val == 0 && val != 0) continue; len += sprintf(str + len, "%s|", xlat->str); rem &= ~(xlat->val); } } /* * If we have leftover bits or didn't match anything, print * the remainder. */ if (rem || len == 0) len += sprintf(str + len, "0x%x", rem); if (len && str[len - 1] == '|') len--; str[len] = 0; return (str); } static void print_integer_arg(const char *(*decoder)(int), FILE *fp, int value) { const char *str; str = decoder(value); if (str != NULL) fputs(str, fp); else fprintf(fp, "%d", value); } static void print_mask_arg(bool (*decoder)(FILE *, int, int *), FILE *fp, int value) { int rem; if (!decoder(fp, value, &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); } static void print_mask_arg32(bool (*decoder)(FILE *, uint32_t, uint32_t *), FILE *fp, uint32_t value) { uint32_t rem; if (!decoder(fp, value, &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); } #ifndef __LP64__ /* * Add argument padding to subsequent system calls after Quad * syscall arguments as needed. This used to be done by hand in the * decoded_syscalls table which was ugly and error prone. It is - * simpler to do the fixup of offsets at initalization time than when + * simpler to do the fixup of offsets at initialization time than when * decoding arguments. */ static void quad_fixup(struct syscall *sc) { int offset, prev; u_int i; offset = 0; prev = -1; for (i = 0; i < sc->nargs; i++) { /* This arg type is a dummy that doesn't use offset.
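 (Editor's note: PipeFds is the only such dummy type; as the PipeFds
 case in print_arg() below shows, pipe's descriptors are recovered from
 the syscall's return values rather than from a memory argument, so
 there is no offset to fix up for it.)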
*/ if ((sc->args[i].type & ARG_MASK) == PipeFds) continue; assert(prev < sc->args[i].offset); prev = sc->args[i].offset; sc->args[i].offset += offset; switch (sc->args[i].type & ARG_MASK) { case Quad: case QuadHex: #ifdef __powerpc__ /* * 64-bit arguments on 32-bit powerpc must be * 64-bit aligned. If the current offset is * not aligned, the calling convention inserts * a 32-bit pad argument that should be skipped. */ if (sc->args[i].offset % 2 == 1) { sc->args[i].offset++; offset++; } #endif offset++; default: break; } } } #endif void init_syscalls(void) { struct syscall *sc; STAILQ_INIT(&syscalls); for (sc = decoded_syscalls; sc->name != NULL; sc++) { #ifndef __LP64__ quad_fixup(sc); #endif STAILQ_INSERT_HEAD(&syscalls, sc, entries); } } static struct syscall * find_syscall(struct procabi *abi, u_int number) { struct extra_syscall *es; if (number < nitems(abi->syscalls)) return (abi->syscalls[number]); STAILQ_FOREACH(es, &abi->extra_syscalls, entries) { if (es->number == number) return (es->sc); } return (NULL); } static void add_syscall(struct procabi *abi, u_int number, struct syscall *sc) { struct extra_syscall *es; if (number < nitems(abi->syscalls)) { assert(abi->syscalls[number] == NULL); abi->syscalls[number] = sc; } else { es = malloc(sizeof(*es)); es->sc = sc; es->number = number; STAILQ_INSERT_TAIL(&abi->extra_syscalls, es, entries); } } /* * If/when the list gets big, it might be desirable to do it * as a hash table or binary search. */ struct syscall * get_syscall(struct threadinfo *t, u_int number, u_int nargs) { struct syscall *sc; const char *name; char *new_name; u_int i; sc = find_syscall(t->proc->abi, number); if (sc != NULL) return (sc); name = sysdecode_syscallname(t->proc->abi->abi, number); if (name == NULL) { asprintf(&new_name, "#%d", number); name = new_name; } else new_name = NULL; STAILQ_FOREACH(sc, &syscalls, entries) { if (strcmp(name, sc->name) == 0) { add_syscall(t->proc->abi, number, sc); free(new_name); return (sc); } } /* It is unknown. Add it into the list. */ #if DEBUG fprintf(stderr, "unknown syscall %s -- setting args to %d\n", name, nargs); #endif sc = calloc(1, sizeof(struct syscall)); sc->name = name; if (new_name != NULL) sc->unknown = true; sc->ret_type = 1; sc->nargs = nargs; for (i = 0; i < nargs; i++) { sc->args[i].offset = i; /* Treat all unknown arguments as LongHex. */ sc->args[i].type = LongHex; } STAILQ_INSERT_HEAD(&syscalls, sc, entries); add_syscall(t->proc->abi, number, sc); return (sc); } /* * Copy a fixed amount of bytes from the process. */ static int get_struct(pid_t pid, uintptr_t offset, void *buf, int len) { struct ptrace_io_desc iorequest; iorequest.piod_op = PIOD_READ_D; iorequest.piod_offs = (void *)offset; iorequest.piod_addr = buf; iorequest.piod_len = len; if (ptrace(PT_IO, pid, (caddr_t)&iorequest, 0) < 0) return (-1); return (0); } #define MAXSIZE 4096 /* * Copy a string from the process. Note that it is * expected to be a C string, but if max is set, it will * only get that much. */ static char * get_string(pid_t pid, uintptr_t addr, int max) { struct ptrace_io_desc iorequest; char *buf, *nbuf; size_t offset, size, totalsize; offset = 0; if (max) size = max + 1; else { /* Read up to the end of the current page. 
*/ size = PAGE_SIZE - ((uintptr_t)addr % PAGE_SIZE); if (size > MAXSIZE) size = MAXSIZE; } totalsize = size; buf = malloc(totalsize); if (buf == NULL) return (NULL); for (;;) { iorequest.piod_op = PIOD_READ_D; iorequest.piod_offs = (void *)(addr + offset); iorequest.piod_addr = buf + offset; iorequest.piod_len = size; if (ptrace(PT_IO, pid, (caddr_t)&iorequest, 0) < 0) { free(buf); return (NULL); } if (memchr(buf + offset, '\0', size) != NULL) return (buf); offset += size; if (totalsize < MAXSIZE && max == 0) { size = MAXSIZE - totalsize; if (size > PAGE_SIZE) size = PAGE_SIZE; nbuf = realloc(buf, totalsize + size); if (nbuf == NULL) { buf[totalsize - 1] = '\0'; return (buf); } buf = nbuf; totalsize += size; } else { buf[totalsize - 1] = '\0'; return (buf); } } } static const char * strsig2(int sig) { static char tmp[32]; const char *signame; signame = sysdecode_signal(sig); if (signame == NULL) { snprintf(tmp, sizeof(tmp), "%d", sig); signame = tmp; } return (signame); } static void print_kevent(FILE *fp, struct kevent *ke) { switch (ke->filter) { case EVFILT_READ: case EVFILT_WRITE: case EVFILT_VNODE: case EVFILT_PROC: case EVFILT_TIMER: case EVFILT_PROCDESC: case EVFILT_EMPTY: fprintf(fp, "%ju", (uintmax_t)ke->ident); break; case EVFILT_SIGNAL: fputs(strsig2(ke->ident), fp); break; default: fprintf(fp, "%p", (void *)ke->ident); } fprintf(fp, ","); print_integer_arg(sysdecode_kevent_filter, fp, ke->filter); fprintf(fp, ","); print_mask_arg(sysdecode_kevent_flags, fp, ke->flags); fprintf(fp, ","); sysdecode_kevent_fflags(fp, ke->filter, ke->fflags, 16); fprintf(fp, ",%#jx,%p", (uintmax_t)ke->data, ke->udata); } static void print_utrace(FILE *fp, void *utrace_addr, size_t len) { unsigned char *utrace_buffer; fprintf(fp, "{ "); if (sysdecode_utrace(fp, utrace_addr, len)) { fprintf(fp, " }"); return; } utrace_buffer = utrace_addr; fprintf(fp, "%zu:", len); while (len--) fprintf(fp, " %02x", *utrace_buffer++); fprintf(fp, " }"); } static void print_pointer(FILE *fp, uintptr_t arg) { fprintf(fp, "%p", (void *)arg); } static void print_sockaddr(FILE *fp, struct trussinfo *trussinfo, uintptr_t arg, socklen_t len) { char addr[64]; struct sockaddr_in *lsin; struct sockaddr_in6 *lsin6; struct sockaddr_un *sun; struct sockaddr *sa; u_char *q; pid_t pid = trussinfo->curthread->proc->pid; if (arg == 0) { fputs("NULL", fp); return; } /* If the length is too small, just bail. */ if (len < sizeof(*sa)) { print_pointer(fp, arg); return; } sa = calloc(1, len); if (get_struct(pid, arg, sa, len) == -1) { free(sa); print_pointer(fp, arg); return; } switch (sa->sa_family) { case AF_INET: if (len < sizeof(*lsin)) goto sockaddr_short; lsin = (struct sockaddr_in *)(void *)sa; inet_ntop(AF_INET, &lsin->sin_addr, addr, sizeof(addr)); fprintf(fp, "{ AF_INET %s:%d }", addr, htons(lsin->sin_port)); break; case AF_INET6: if (len < sizeof(*lsin6)) goto sockaddr_short; lsin6 = (struct sockaddr_in6 *)(void *)sa; inet_ntop(AF_INET6, &lsin6->sin6_addr, addr, sizeof(addr)); fprintf(fp, "{ AF_INET6 [%s]:%d }", addr, htons(lsin6->sin6_port)); break; case AF_UNIX: sun = (struct sockaddr_un *)sa; fprintf(fp, "{ AF_UNIX \"%.*s\" }", (int)(len - offsetof(struct sockaddr_un, sun_path)), sun->sun_path); break; default: sockaddr_short: fprintf(fp, "{ sa_len = %d, sa_family = %d, sa_data = {", (int)sa->sa_len, (int)sa->sa_family); for (q = (u_char *)sa->sa_data; q < (u_char *)sa + len; q++) fprintf(fp, "%s 0x%02x", q == (u_char *)sa->sa_data ? 
"" : ",", *q); fputs(" } }", fp); } free(sa); } #define IOV_LIMIT 16 static void print_iovec(FILE *fp, struct trussinfo *trussinfo, uintptr_t arg, int iovcnt) { struct iovec iov[IOV_LIMIT]; size_t max_string = trussinfo->strsize; char tmp2[max_string + 1], *tmp3; size_t len; pid_t pid = trussinfo->curthread->proc->pid; int i; bool buf_truncated, iov_truncated; if (iovcnt <= 0) { print_pointer(fp, arg); return; } if (iovcnt > IOV_LIMIT) { iovcnt = IOV_LIMIT; iov_truncated = true; } else { iov_truncated = false; } if (get_struct(pid, arg, &iov, iovcnt * sizeof(struct iovec)) == -1) { print_pointer(fp, arg); return; } fputs("[", fp); for (i = 0; i < iovcnt; i++) { len = iov[i].iov_len; if (len > max_string) { len = max_string; buf_truncated = true; } else { buf_truncated = false; } fprintf(fp, "%s{", (i > 0) ? "," : ""); if (len && get_struct(pid, (uintptr_t)iov[i].iov_base, &tmp2, len) != -1) { tmp3 = malloc(len * 4 + 1); while (len) { if (strvisx(tmp3, tmp2, len, VIS_CSTYLE|VIS_TAB|VIS_NL) <= (int)max_string) break; len--; buf_truncated = true; } fprintf(fp, "\"%s\"%s", tmp3, buf_truncated ? "..." : ""); free(tmp3); } else { print_pointer(fp, (uintptr_t)iov[i].iov_base); } fprintf(fp, ",%zu}", iov[i].iov_len); } fprintf(fp, "%s%s", iov_truncated ? ",..." : "", "]"); } static void print_gen_cmsg(FILE *fp, struct cmsghdr *cmsghdr) { u_char *q; fputs("{", fp); for (q = CMSG_DATA(cmsghdr); q < (u_char *)cmsghdr + cmsghdr->cmsg_len; q++) { fprintf(fp, "%s0x%02x", q == CMSG_DATA(cmsghdr) ? "" : ",", *q); } fputs("}", fp); } static void print_sctp_initmsg(FILE *fp, struct sctp_initmsg *init) { fprintf(fp, "{out=%u,", init->sinit_num_ostreams); fprintf(fp, "in=%u,", init->sinit_max_instreams); fprintf(fp, "max_rtx=%u,", init->sinit_max_attempts); fprintf(fp, "max_rto=%u}", init->sinit_max_init_timeo); } static void print_sctp_sndrcvinfo(FILE *fp, bool receive, struct sctp_sndrcvinfo *info) { fprintf(fp, "{sid=%u,", info->sinfo_stream); if (receive) { fprintf(fp, "ssn=%u,", info->sinfo_ssn); } fputs("flgs=", fp); sysdecode_sctp_sinfo_flags(fp, info->sinfo_flags); fprintf(fp, ",ppid=%u,", ntohl(info->sinfo_ppid)); if (!receive) { fprintf(fp, "ctx=%u,", info->sinfo_context); fprintf(fp, "ttl=%u,", info->sinfo_timetolive); } if (receive) { fprintf(fp, "tsn=%u,", info->sinfo_tsn); fprintf(fp, "cumtsn=%u,", info->sinfo_cumtsn); } fprintf(fp, "id=%u}", info->sinfo_assoc_id); } static void print_sctp_sndinfo(FILE *fp, struct sctp_sndinfo *info) { fprintf(fp, "{sid=%u,", info->snd_sid); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_snd_flags, fp, info->snd_flags); fprintf(fp, ",ppid=%u,", ntohl(info->snd_ppid)); fprintf(fp, "ctx=%u,", info->snd_context); fprintf(fp, "id=%u}", info->snd_assoc_id); } static void print_sctp_rcvinfo(FILE *fp, struct sctp_rcvinfo *info) { fprintf(fp, "{sid=%u,", info->rcv_sid); fprintf(fp, "ssn=%u,", info->rcv_ssn); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_rcv_flags, fp, info->rcv_flags); fprintf(fp, ",ppid=%u,", ntohl(info->rcv_ppid)); fprintf(fp, "tsn=%u,", info->rcv_tsn); fprintf(fp, "cumtsn=%u,", info->rcv_cumtsn); fprintf(fp, "ctx=%u,", info->rcv_context); fprintf(fp, "id=%u}", info->rcv_assoc_id); } static void print_sctp_nxtinfo(FILE *fp, struct sctp_nxtinfo *info) { fprintf(fp, "{sid=%u,", info->nxt_sid); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_nxt_flags, fp, info->nxt_flags); fprintf(fp, ",ppid=%u,", ntohl(info->nxt_ppid)); fprintf(fp, "len=%u,", info->nxt_length); fprintf(fp, "id=%u}", info->nxt_assoc_id); } static void print_sctp_prinfo(FILE 
*fp, struct sctp_prinfo *info) { fputs("{pol=", fp); print_integer_arg(sysdecode_sctp_pr_policy, fp, info->pr_policy); fprintf(fp, ",val=%u}", info->pr_value); } static void print_sctp_authinfo(FILE *fp, struct sctp_authinfo *info) { fprintf(fp, "{num=%u}", info->auth_keynumber); } static void print_sctp_ipv4_addr(FILE *fp, struct in_addr *addr) { char buf[INET_ADDRSTRLEN]; const char *s; s = inet_ntop(AF_INET, addr, buf, INET_ADDRSTRLEN); if (s != NULL) fprintf(fp, "{addr=%s}", s); else fputs("{addr=???}", fp); } static void print_sctp_ipv6_addr(FILE *fp, struct in6_addr *addr) { char buf[INET6_ADDRSTRLEN]; const char *s; s = inet_ntop(AF_INET6, addr, buf, INET6_ADDRSTRLEN); if (s != NULL) fprintf(fp, "{addr=%s}", s); else fputs("{addr=???}", fp); } static void print_sctp_cmsg(FILE *fp, bool receive, struct cmsghdr *cmsghdr) { void *data; socklen_t len; len = cmsghdr->cmsg_len; data = CMSG_DATA(cmsghdr); switch (cmsghdr->cmsg_type) { case SCTP_INIT: if (len == CMSG_LEN(sizeof(struct sctp_initmsg))) print_sctp_initmsg(fp, (struct sctp_initmsg *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_SNDRCV: if (len == CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) print_sctp_sndrcvinfo(fp, receive, (struct sctp_sndrcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; #if 0 case SCTP_EXTRCV: if (len == CMSG_LEN(sizeof(struct sctp_extrcvinfo))) print_sctp_extrcvinfo(fp, (struct sctp_extrcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; #endif case SCTP_SNDINFO: if (len == CMSG_LEN(sizeof(struct sctp_sndinfo))) print_sctp_sndinfo(fp, (struct sctp_sndinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_RCVINFO: if (len == CMSG_LEN(sizeof(struct sctp_rcvinfo))) print_sctp_rcvinfo(fp, (struct sctp_rcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_NXTINFO: if (len == CMSG_LEN(sizeof(struct sctp_nxtinfo))) print_sctp_nxtinfo(fp, (struct sctp_nxtinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_PRINFO: if (len == CMSG_LEN(sizeof(struct sctp_prinfo))) print_sctp_prinfo(fp, (struct sctp_prinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_AUTHINFO: if (len == CMSG_LEN(sizeof(struct sctp_authinfo))) print_sctp_authinfo(fp, (struct sctp_authinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_DSTADDRV4: if (len == CMSG_LEN(sizeof(struct in_addr))) print_sctp_ipv4_addr(fp, (struct in_addr *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_DSTADDRV6: if (len == CMSG_LEN(sizeof(struct in6_addr))) print_sctp_ipv6_addr(fp, (struct in6_addr *)data); else print_gen_cmsg(fp, cmsghdr); break; default: print_gen_cmsg(fp, cmsghdr); } } static void print_cmsgs(FILE *fp, pid_t pid, bool receive, struct msghdr *msghdr) { struct cmsghdr *cmsghdr; char *cmsgbuf; const char *temp; socklen_t len; int level, type; bool first; len = msghdr->msg_controllen; if (len == 0) { fputs("{}", fp); return; } cmsgbuf = calloc(1, len); if (get_struct(pid, (uintptr_t)msghdr->msg_control, cmsgbuf, len) == -1) { print_pointer(fp, (uintptr_t)msghdr->msg_control); free(cmsgbuf); return; } msghdr->msg_control = cmsgbuf; first = true; fputs("{", fp); for (cmsghdr = CMSG_FIRSTHDR(msghdr); cmsghdr != NULL; cmsghdr = CMSG_NXTHDR(msghdr, cmsghdr)) { level = cmsghdr->cmsg_level; type = cmsghdr->cmsg_type; len = cmsghdr->cmsg_len; fprintf(fp, "%s{level=", first ? 
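/*
 * (Editor's aside, a minimal sketch of the traversal pattern used here,
 * built from the standard CMSG_* macros of <sys/socket.h>; handle() is
 * a hypothetical consumer:)
 *
 *	struct cmsghdr *c;
 *
 *	for (c = CMSG_FIRSTHDR(&msg); c != NULL; c = CMSG_NXTHDR(&msg, c))
 *		handle(c->cmsg_level, c->cmsg_type, CMSG_DATA(c));
 *
 * Note that print_cmsgs() copies the traced process's msg_control
 * buffer into local memory and repoints msghdr->msg_control at the
 * copy first, because the CMSG_* macros dereference the buffer
 * directly.
 */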
"" : ","); print_integer_arg(sysdecode_sockopt_level, fp, level); fputs(",type=", fp); temp = sysdecode_cmsg_type(level, type); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", type); } fputs(",data=", fp); switch (level) { case IPPROTO_SCTP: print_sctp_cmsg(fp, receive, cmsghdr); break; default: print_gen_cmsg(fp, cmsghdr); break; } fputs("}", fp); first = false; } fputs("}", fp); free(cmsgbuf); } static void print_sysctl_oid(FILE *fp, int *oid, int len) { int i; for (i = 0; i < len; i++) fprintf(fp, ".%d", oid[i]); } /* * Converts a syscall argument into a string. Said string is * allocated via malloc(), so needs to be free()'d. sc is * a pointer to the syscall description (see above); args is * an array of all of the system call arguments. */ char * print_arg(struct syscall_args *sc, unsigned long *args, register_t *retval, struct trussinfo *trussinfo) { FILE *fp; char *tmp; size_t tmplen; pid_t pid; fp = open_memstream(&tmp, &tmplen); pid = trussinfo->curthread->proc->pid; switch (sc->type & ARG_MASK) { case Hex: fprintf(fp, "0x%x", (int)args[sc->offset]); break; case Octal: fprintf(fp, "0%o", (int)args[sc->offset]); break; case Int: fprintf(fp, "%d", (int)args[sc->offset]); break; case UInt: fprintf(fp, "%u", (unsigned int)args[sc->offset]); break; case PUInt: { unsigned int val; if (get_struct(pid, args[sc->offset], &val, sizeof(val)) == 0) fprintf(fp, "{ %u }", val); else print_pointer(fp, args[sc->offset]); break; } case LongHex: fprintf(fp, "0x%lx", args[sc->offset]); break; case Long: fprintf(fp, "%ld", args[sc->offset]); break; case Sizet: fprintf(fp, "%zu", (size_t)args[sc->offset]); break; case ShmName: /* Handle special SHM_ANON value. */ if ((char *)args[sc->offset] == SHM_ANON) { fprintf(fp, "SHM_ANON"); break; } /* FALLTHROUGH */ case Name: { /* NULL-terminated string. */ char *tmp2; tmp2 = get_string(pid, args[sc->offset], 0); fprintf(fp, "\"%s\"", tmp2); free(tmp2); break; } case BinString: { /* * Binary block of data that might have printable characters. * XXX If type|OUT, assume that the length is the syscall's * return value. Otherwise, assume that the length of the block * is in the next syscall argument. */ int max_string = trussinfo->strsize; char tmp2[max_string + 1], *tmp3; int len; int truncated = 0; if (sc->type & OUT) len = retval[0]; else len = args[sc->offset + 1]; /* * Don't print more than max_string characters, to avoid word * wrap. If we have to truncate put some ... after the string. */ if (len > max_string) { len = max_string; truncated = 1; } if (len && get_struct(pid, args[sc->offset], &tmp2, len) != -1) { tmp3 = malloc(len * 4 + 1); while (len) { if (strvisx(tmp3, tmp2, len, VIS_CSTYLE|VIS_TAB|VIS_NL) <= max_string) break; len--; truncated = 1; } fprintf(fp, "\"%s\"%s", tmp3, truncated ? "..." : ""); free(tmp3); } else { print_pointer(fp, args[sc->offset]); } break; } case ExecArgs: case ExecEnv: case StringArray: { uintptr_t addr; union { char *strarray[0]; char buf[PAGE_SIZE]; } u; char *string; size_t len; u_int first, i; /* * Only parse argv[] and environment arrays from exec calls * if requested. */ if (((sc->type & ARG_MASK) == ExecArgs && (trussinfo->flags & EXECVEARGS) == 0) || ((sc->type & ARG_MASK) == ExecEnv && (trussinfo->flags & EXECVEENVS) == 0)) { print_pointer(fp, args[sc->offset]); break; } /* * Read a page of pointers at a time. Punt if the top-level * pointer is not aligned. Note that the first read is of * a partial page. 
*/ addr = args[sc->offset]; if (addr % sizeof(char *) != 0) { print_pointer(fp, args[sc->offset]); break; } len = PAGE_SIZE - (addr & PAGE_MASK); if (get_struct(pid, addr, u.buf, len) == -1) { print_pointer(fp, args[sc->offset]); break; } fputc('[', fp); first = 1; i = 0; while (u.strarray[i] != NULL) { string = get_string(pid, (uintptr_t)u.strarray[i], 0); fprintf(fp, "%s \"%s\"", first ? "" : ",", string); free(string); first = 0; i++; if (i == len / sizeof(char *)) { addr += len; len = PAGE_SIZE; if (get_struct(pid, addr, u.buf, len) == -1) { fprintf(fp, ", "); break; } i = 0; } } fputs(" ]", fp); break; } #ifdef __LP64__ case Quad: fprintf(fp, "%ld", args[sc->offset]); break; case QuadHex: fprintf(fp, "0x%lx", args[sc->offset]); break; #else case Quad: case QuadHex: { unsigned long long ll; #if _BYTE_ORDER == _LITTLE_ENDIAN ll = (unsigned long long)args[sc->offset + 1] << 32 | args[sc->offset]; #else ll = (unsigned long long)args[sc->offset] << 32 | args[sc->offset + 1]; #endif if ((sc->type & ARG_MASK) == Quad) fprintf(fp, "%lld", ll); else fprintf(fp, "0x%llx", ll); break; } #endif case PQuadHex: { uint64_t val; if (get_struct(pid, args[sc->offset], &val, sizeof(val)) == 0) fprintf(fp, "{ 0x%jx }", (uintmax_t)val); else print_pointer(fp, args[sc->offset]); break; } case Ptr: print_pointer(fp, args[sc->offset]); break; case Readlinkres: { char *tmp2; if (retval[0] == -1) break; tmp2 = get_string(pid, args[sc->offset], retval[0]); fprintf(fp, "\"%s\"", tmp2); free(tmp2); break; } case Ioctl: { const char *temp; unsigned long cmd; cmd = args[sc->offset]; temp = sysdecode_ioctlname(cmd); if (temp) fputs(temp, fp); else { fprintf(fp, "0x%lx { IO%s%s 0x%lx('%c'), %lu, %lu }", cmd, cmd & IOC_OUT ? "R" : "", cmd & IOC_IN ? "W" : "", IOCGROUP(cmd), isprint(IOCGROUP(cmd)) ? 
(char)IOCGROUP(cmd) : '?', cmd & 0xFF, IOCPARM_LEN(cmd)); } break; } case Timespec: { struct timespec ts; if (get_struct(pid, args[sc->offset], &ts, sizeof(ts)) != -1) fprintf(fp, "{ %jd.%09ld }", (intmax_t)ts.tv_sec, ts.tv_nsec); else print_pointer(fp, args[sc->offset]); break; } case Timespec2: { struct timespec ts[2]; const char *sep; unsigned int i; if (get_struct(pid, args[sc->offset], &ts, sizeof(ts)) != -1) { fputs("{ ", fp); sep = ""; for (i = 0; i < nitems(ts); i++) { fputs(sep, fp); sep = ", "; switch (ts[i].tv_nsec) { case UTIME_NOW: fprintf(fp, "UTIME_NOW"); break; case UTIME_OMIT: fprintf(fp, "UTIME_OMIT"); break; default: fprintf(fp, "%jd.%09ld", (intmax_t)ts[i].tv_sec, ts[i].tv_nsec); break; } } fputs(" }", fp); } else print_pointer(fp, args[sc->offset]); break; } case Timeval: { struct timeval tv; if (get_struct(pid, args[sc->offset], &tv, sizeof(tv)) != -1) fprintf(fp, "{ %jd.%06ld }", (intmax_t)tv.tv_sec, tv.tv_usec); else print_pointer(fp, args[sc->offset]); break; } case Timeval2: { struct timeval tv[2]; if (get_struct(pid, args[sc->offset], &tv, sizeof(tv)) != -1) fprintf(fp, "{ %jd.%06ld, %jd.%06ld }", (intmax_t)tv[0].tv_sec, tv[0].tv_usec, (intmax_t)tv[1].tv_sec, tv[1].tv_usec); else print_pointer(fp, args[sc->offset]); break; } case Itimerval: { struct itimerval itv; if (get_struct(pid, args[sc->offset], &itv, sizeof(itv)) != -1) fprintf(fp, "{ %jd.%06ld, %jd.%06ld }", (intmax_t)itv.it_interval.tv_sec, itv.it_interval.tv_usec, (intmax_t)itv.it_value.tv_sec, itv.it_value.tv_usec); else print_pointer(fp, args[sc->offset]); break; } case LinuxSockArgs: { struct linux_socketcall_args largs; if (get_struct(pid, args[sc->offset], (void *)&largs, sizeof(largs)) != -1) fprintf(fp, "{ %s, 0x%lx }", lookup(linux_socketcall_ops, largs.what, 10), (long unsigned int)largs.args); else print_pointer(fp, args[sc->offset]); break; } case Pollfd: { /* * XXX: A Pollfd argument expects the /next/ syscall argument * to be the number of fds in the array. This matches the poll * syscall. */ struct pollfd *pfd; int numfds = args[sc->offset + 1]; size_t bytes = sizeof(struct pollfd) * numfds; int i; if ((pfd = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for pollfd array", bytes); if (get_struct(pid, args[sc->offset], pfd, bytes) != -1) { fputs("{", fp); for (i = 0; i < numfds; i++) { fprintf(fp, " %d/%s", pfd[i].fd, xlookup_bits(poll_flags, pfd[i].events)); } fputs(" }", fp); } else { print_pointer(fp, args[sc->offset]); } free(pfd); break; } case Fd_set: { /* * XXX: A Fd_set argument expects the /first/ syscall argument * to be the number of fds in the array. This matches the * select syscall. */ fd_set *fds; int numfds = args[0]; size_t bytes = _howmany(numfds, _NFDBITS) * _NFDBITS; int i; if ((fds = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for fd_set array", bytes); if (get_struct(pid, args[sc->offset], fds, bytes) != -1) { fputs("{", fp); for (i = 0; i < numfds; i++) { if (FD_ISSET(i, fds)) fprintf(fp, " %d", i); } fputs(" }", fp); } else print_pointer(fp, args[sc->offset]); free(fds); break; } case Signal: fputs(strsig2(args[sc->offset]), fp); break; case Sigset: { long sig; sigset_t ss; int i, first; sig = args[sc->offset]; if (get_struct(pid, args[sc->offset], (void *)&ss, sizeof(ss)) == -1) { print_pointer(fp, args[sc->offset]); break; } fputs("{ ", fp); first = 1; for (i = 1; i < sys_nsig; i++) { if (sigismember(&ss, i)) { fprintf(fp, "%s%s", !first ? 
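/*
 * (Editor's aside.)  This loop renders a signal set as the member
 * names joined by '|', so a mask holding SIGINT and SIGTERM prints as
 * "{ SIGINT|SIGTERM }".
 */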
"|" : "", strsig2(i)); first = 0; } } if (!first) fputc(' ', fp); fputc('}', fp); break; } case Sigprocmask: print_integer_arg(sysdecode_sigprocmask_how, fp, args[sc->offset]); break; case Fcntlflag: /* XXX: Output depends on the value of the previous argument. */ if (sysdecode_fcntl_arg_p(args[sc->offset - 1])) sysdecode_fcntl_arg(fp, args[sc->offset - 1], args[sc->offset], 16); break; case Open: print_mask_arg(sysdecode_open_flags, fp, args[sc->offset]); break; case Fcntl: print_integer_arg(sysdecode_fcntl_cmd, fp, args[sc->offset]); break; case Mprot: print_mask_arg(sysdecode_mmap_prot, fp, args[sc->offset]); break; case Mmapflags: print_mask_arg(sysdecode_mmap_flags, fp, args[sc->offset]); break; case Whence: print_integer_arg(sysdecode_whence, fp, args[sc->offset]); break; case Sockdomain: print_integer_arg(sysdecode_socketdomain, fp, args[sc->offset]); break; case Socktype: print_mask_arg(sysdecode_socket_type, fp, args[sc->offset]); break; case Shutdown: print_integer_arg(sysdecode_shutdown_how, fp, args[sc->offset]); break; case Resource: print_integer_arg(sysdecode_rlimit, fp, args[sc->offset]); break; case RusageWho: print_integer_arg(sysdecode_getrusage_who, fp, args[sc->offset]); break; case Pathconf: print_integer_arg(sysdecode_pathconf_name, fp, args[sc->offset]); break; case Rforkflags: print_mask_arg(sysdecode_rfork_flags, fp, args[sc->offset]); break; case Sockaddr: { socklen_t len; if (args[sc->offset] == 0) { fputs("NULL", fp); break; } /* * Extract the address length from the next argument. If * this is an output sockaddr (OUT is set), then the * next argument is a pointer to a socklen_t. Otherwise * the next argument contains a socklen_t by value. */ if (sc->type & OUT) { if (get_struct(pid, args[sc->offset + 1], &len, sizeof(len)) == -1) { print_pointer(fp, args[sc->offset]); break; } } else len = args[sc->offset + 1]; print_sockaddr(fp, trussinfo, args[sc->offset], len); break; } case Sigaction: { struct sigaction sa; if (get_struct(pid, args[sc->offset], &sa, sizeof(sa)) != -1) { fputs("{ ", fp); if (sa.sa_handler == SIG_DFL) fputs("SIG_DFL", fp); else if (sa.sa_handler == SIG_IGN) fputs("SIG_IGN", fp); else fprintf(fp, "%p", sa.sa_handler); fprintf(fp, " %s ss_t }", xlookup_bits(sigaction_flags, sa.sa_flags)); } else print_pointer(fp, args[sc->offset]); break; } case Kevent: { /* * XXX XXX: The size of the array is determined by either the * next syscall argument, or by the syscall return value, * depending on which argument number we are. This matches the * kevent syscall, but luckily that's the only syscall that uses * them. 
*/ struct kevent *ke; int numevents = -1; size_t bytes; int i; if (sc->offset == 1) numevents = args[sc->offset+1]; else if (sc->offset == 3 && retval[0] != -1) numevents = retval[0]; if (numevents >= 0) { bytes = sizeof(struct kevent) * numevents; if ((ke = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for kevent array", bytes); } else ke = NULL; if (numevents >= 0 && get_struct(pid, args[sc->offset], ke, bytes) != -1) { fputc('{', fp); for (i = 0; i < numevents; i++) { fputc(' ', fp); print_kevent(fp, &ke[i]); } fputs(" }", fp); } else { print_pointer(fp, args[sc->offset]); } free(ke); break; } case Kevent11: { struct kevent_freebsd11 *ke11; struct kevent ke; int numevents = -1; size_t bytes; int i; if (sc->offset == 1) numevents = args[sc->offset+1]; else if (sc->offset == 3 && retval[0] != -1) numevents = retval[0]; if (numevents >= 0) { bytes = sizeof(struct kevent_freebsd11) * numevents; if ((ke11 = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for kevent array", bytes); } else ke11 = NULL; memset(&ke, 0, sizeof(ke)); if (numevents >= 0 && get_struct(pid, args[sc->offset], ke11, bytes) != -1) { fputc('{', fp); for (i = 0; i < numevents; i++) { fputc(' ', fp); ke.ident = ke11[i].ident; ke.filter = ke11[i].filter; ke.flags = ke11[i].flags; ke.fflags = ke11[i].fflags; ke.data = ke11[i].data; ke.udata = ke11[i].udata; print_kevent(fp, &ke); } fputs(" }", fp); } else { print_pointer(fp, args[sc->offset]); } free(ke11); break; } case Stat: { struct stat st; if (get_struct(pid, args[sc->offset], &st, sizeof(st)) != -1) { char mode[12]; strmode(st.st_mode, mode); fprintf(fp, "{ mode=%s,inode=%ju,size=%jd,blksize=%ld }", mode, (uintmax_t)st.st_ino, (intmax_t)st.st_size, (long)st.st_blksize); } else { print_pointer(fp, args[sc->offset]); } break; } case Stat11: { struct freebsd11_stat st; if (get_struct(pid, args[sc->offset], &st, sizeof(st)) != -1) { char mode[12]; strmode(st.st_mode, mode); fprintf(fp, "{ mode=%s,inode=%ju,size=%jd,blksize=%ld }", mode, (uintmax_t)st.st_ino, (intmax_t)st.st_size, (long)st.st_blksize); } else { print_pointer(fp, args[sc->offset]); } break; } case StatFs: { unsigned int i; struct statfs buf; if (get_struct(pid, args[sc->offset], &buf, sizeof(buf)) != -1) { char fsid[17]; bzero(fsid, sizeof(fsid)); if (buf.f_fsid.val[0] != 0 || buf.f_fsid.val[1] != 0) { for (i = 0; i < sizeof(buf.f_fsid); i++) snprintf(&fsid[i*2], sizeof(fsid) - (i*2), "%02x", ((u_char *)&buf.f_fsid)[i]); } fprintf(fp, "{ fstypename=%s,mntonname=%s,mntfromname=%s," "fsid=%s }", buf.f_fstypename, buf.f_mntonname, buf.f_mntfromname, fsid); } else print_pointer(fp, args[sc->offset]); break; } case Rusage: { struct rusage ru; if (get_struct(pid, args[sc->offset], &ru, sizeof(ru)) != -1) { fprintf(fp, "{ u=%jd.%06ld,s=%jd.%06ld,in=%ld,out=%ld }", (intmax_t)ru.ru_utime.tv_sec, ru.ru_utime.tv_usec, (intmax_t)ru.ru_stime.tv_sec, ru.ru_stime.tv_usec, ru.ru_inblock, ru.ru_oublock); } else print_pointer(fp, args[sc->offset]); break; } case Rlimit: { struct rlimit rl; if (get_struct(pid, args[sc->offset], &rl, sizeof(rl)) != -1) { fprintf(fp, "{ cur=%ju,max=%ju }", rl.rlim_cur, rl.rlim_max); } else print_pointer(fp, args[sc->offset]); break; } case ExitStatus: { int status; if (get_struct(pid, args[sc->offset], &status, sizeof(status)) != -1) { fputs("{ ", fp); if (WIFCONTINUED(status)) fputs("CONTINUED", fp); else if (WIFEXITED(status)) fprintf(fp, "EXITED,val=%d", WEXITSTATUS(status)); else if (WIFSIGNALED(status)) fprintf(fp, "SIGNALED,sig=%s%s", strsig2(WTERMSIG(status)), 
WCOREDUMP(status) ? ",cored" : ""); else fprintf(fp, "STOPPED,sig=%s", strsig2(WTERMSIG(status))); fputs(" }", fp); } else print_pointer(fp, args[sc->offset]); break; } case Waitoptions: print_mask_arg(sysdecode_wait6_options, fp, args[sc->offset]); break; case Idtype: print_integer_arg(sysdecode_idtype, fp, args[sc->offset]); break; case Procctl: print_integer_arg(sysdecode_procctl_cmd, fp, args[sc->offset]); break; case Umtxop: print_integer_arg(sysdecode_umtx_op, fp, args[sc->offset]); break; case Atfd: print_integer_arg(sysdecode_atfd, fp, args[sc->offset]); break; case Atflags: print_mask_arg(sysdecode_atflags, fp, args[sc->offset]); break; case Accessmode: print_mask_arg(sysdecode_access_mode, fp, args[sc->offset]); break; case Sysarch: print_integer_arg(sysdecode_sysarch_number, fp, args[sc->offset]); break; case Sysctl: { char name[BUFSIZ]; int oid[CTL_MAXNAME + 2], qoid[CTL_MAXNAME + 2]; size_t i; int len; memset(name, 0, sizeof(name)); len = args[sc->offset + 1]; if (get_struct(pid, args[sc->offset], oid, len * sizeof(oid[0])) != -1) { fprintf(fp, "\""); if (oid[0] == CTL_SYSCTL) { fprintf(fp, "sysctl."); switch (oid[1]) { case CTL_SYSCTL_DEBUG: fprintf(fp, "debug"); break; case CTL_SYSCTL_NAME: fprintf(fp, "name"); print_sysctl_oid(fp, oid + 2, len - 2); break; case CTL_SYSCTL_NEXT: fprintf(fp, "next"); break; case CTL_SYSCTL_NAME2OID: fprintf(fp, "name2oid"); break; case CTL_SYSCTL_OIDFMT: fprintf(fp, "oidfmt"); print_sysctl_oid(fp, oid + 2, len - 2); break; case CTL_SYSCTL_OIDDESCR: fprintf(fp, "oiddescr"); print_sysctl_oid(fp, oid + 2, len - 2); break; case CTL_SYSCTL_OIDLABEL: fprintf(fp, "oidlabel"); print_sysctl_oid(fp, oid + 2, len - 2); break; default: print_sysctl_oid(fp, oid + 1, len - 1); } } else { qoid[0] = CTL_SYSCTL; qoid[1] = CTL_SYSCTL_NAME; memcpy(qoid + 2, oid, len * sizeof(int)); i = sizeof(name); if (sysctl(qoid, len + 2, name, &i, 0, 0) == -1) print_sysctl_oid(fp, qoid + 2, len); else fprintf(fp, "%s", name); } fprintf(fp, "\""); } break; } case PipeFds: /* * The pipe() system call in the kernel returns its * two file descriptors via return values. However, * the interface exposed by libc is that pipe() * accepts a pointer to an array of descriptors. * Format the output to match the libc API by printing * the returned file descriptors as a fake argument. * * Overwrite the first retval to signal a successful * return as well. */ fprintf(fp, "{ %d, %d }", (int)retval[0], (int)retval[1]); retval[0] = 0; break; case Utrace: { size_t len; void *utrace_addr; len = args[sc->offset + 1]; utrace_addr = calloc(1, len); if (get_struct(pid, args[sc->offset], (void *)utrace_addr, len) != -1) print_utrace(fp, utrace_addr, len); else print_pointer(fp, args[sc->offset]); free(utrace_addr); break; } case IntArray: { int descriptors[16]; unsigned long i, ndescriptors; bool truncated; ndescriptors = args[sc->offset + 1]; truncated = false; if (ndescriptors > nitems(descriptors)) { ndescriptors = nitems(descriptors); truncated = true; } if (get_struct(pid, args[sc->offset], descriptors, ndescriptors * sizeof(descriptors[0])) != -1) { fprintf(fp, "{"); for (i = 0; i < ndescriptors; i++) fprintf(fp, i == 0 ? " %d" : ", %d", descriptors[i]); fprintf(fp, truncated ? ", ... 
}" : " }"); } else print_pointer(fp, args[sc->offset]); break; } case Pipe2: print_mask_arg(sysdecode_pipe2_flags, fp, args[sc->offset]); break; case CapFcntlRights: { uint32_t rights; if (sc->type & OUT) { if (get_struct(pid, args[sc->offset], &rights, sizeof(rights)) == -1) { print_pointer(fp, args[sc->offset]); break; } } else rights = args[sc->offset]; print_mask_arg32(sysdecode_cap_fcntlrights, fp, rights); break; } case Fadvice: print_integer_arg(sysdecode_fadvice, fp, args[sc->offset]); break; case FileFlags: { fflags_t rem; if (!sysdecode_fileflags(fp, args[sc->offset], &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); break; } case Flockop: print_mask_arg(sysdecode_flock_operation, fp, args[sc->offset]); break; case Getfsstatmode: print_integer_arg(sysdecode_getfsstat_mode, fp, args[sc->offset]); break; case Kldsymcmd: print_integer_arg(sysdecode_kldsym_cmd, fp, args[sc->offset]); break; case Kldunloadflags: print_integer_arg(sysdecode_kldunload_flags, fp, args[sc->offset]); break; case Madvice: print_integer_arg(sysdecode_madvice, fp, args[sc->offset]); break; case Socklent: fprintf(fp, "%u", (socklen_t)args[sc->offset]); break; case Sockprotocol: { const char *temp; int domain, protocol; domain = args[sc->offset - 2]; protocol = args[sc->offset]; if (protocol == 0) { fputs("0", fp); } else { temp = sysdecode_socket_protocol(domain, protocol); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", protocol); } } break; } case Sockoptlevel: print_integer_arg(sysdecode_sockopt_level, fp, args[sc->offset]); break; case Sockoptname: { const char *temp; int level, name; level = args[sc->offset - 1]; name = args[sc->offset]; temp = sysdecode_sockopt_name(level, name); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", name); } break; } case Msgflags: print_mask_arg(sysdecode_msg_flags, fp, args[sc->offset]); break; case CapRights: { cap_rights_t rights; if (get_struct(pid, args[sc->offset], &rights, sizeof(rights)) != -1) { fputs("{ ", fp); sysdecode_cap_rights(fp, &rights); fputs(" }", fp); } else print_pointer(fp, args[sc->offset]); break; } case Acltype: print_integer_arg(sysdecode_acltype, fp, args[sc->offset]); break; case Extattrnamespace: print_integer_arg(sysdecode_extattrnamespace, fp, args[sc->offset]); break; case Minherit: print_integer_arg(sysdecode_minherit_inherit, fp, args[sc->offset]); break; case Mlockall: print_mask_arg(sysdecode_mlockall_flags, fp, args[sc->offset]); break; case Mountflags: print_mask_arg(sysdecode_mount_flags, fp, args[sc->offset]); break; case Msync: print_mask_arg(sysdecode_msync_flags, fp, args[sc->offset]); break; case Priowhich: print_integer_arg(sysdecode_prio_which, fp, args[sc->offset]); break; case Ptraceop: print_integer_arg(sysdecode_ptrace_request, fp, args[sc->offset]); break; case Quotactlcmd: if (!sysdecode_quotactl_cmd(fp, args[sc->offset])) fprintf(fp, "%#x", (int)args[sc->offset]); break; case Reboothowto: print_mask_arg(sysdecode_reboot_howto, fp, args[sc->offset]); break; case Rtpriofunc: print_integer_arg(sysdecode_rtprio_function, fp, args[sc->offset]); break; case Schedpolicy: print_integer_arg(sysdecode_scheduler_policy, fp, args[sc->offset]); break; case Schedparam: { struct sched_param sp; if (get_struct(pid, args[sc->offset], &sp, sizeof(sp)) != -1) fprintf(fp, "{ %d }", sp.sched_priority); else print_pointer(fp, args[sc->offset]); break; } case PSig: { int sig; if (get_struct(pid, args[sc->offset], &sig, sizeof(sig)) == 0) fprintf(fp, "{ %s }", strsig2(sig)); else print_pointer(fp, 
args[sc->offset]); break; } case Siginfo: { siginfo_t si; if (get_struct(pid, args[sc->offset], &si, sizeof(si)) != -1) { fprintf(fp, "{ signo=%s", strsig2(si.si_signo)); decode_siginfo(fp, &si); fprintf(fp, " }"); } else print_pointer(fp, args[sc->offset]); break; } case Iovec: /* * Print argument as an array of struct iovec, where the next * syscall argument is the number of elements of the array. */ print_iovec(fp, trussinfo, args[sc->offset], (int)args[sc->offset + 1]); break; case Sctpsndrcvinfo: { struct sctp_sndrcvinfo info; if (get_struct(pid, args[sc->offset], &info, sizeof(struct sctp_sndrcvinfo)) == -1) { print_pointer(fp, args[sc->offset]); break; } print_sctp_sndrcvinfo(fp, sc->type & OUT, &info); break; } case Msghdr: { struct msghdr msghdr; if (get_struct(pid, args[sc->offset], &msghdr, sizeof(struct msghdr)) == -1) { print_pointer(fp, args[sc->offset]); break; } fputs("{", fp); print_sockaddr(fp, trussinfo, (uintptr_t)msghdr.msg_name, msghdr.msg_namelen); fprintf(fp, ",%d,", msghdr.msg_namelen); print_iovec(fp, trussinfo, (uintptr_t)msghdr.msg_iov, msghdr.msg_iovlen); fprintf(fp, ",%d,", msghdr.msg_iovlen); print_cmsgs(fp, pid, sc->type & OUT, &msghdr); fprintf(fp, ",%u,", msghdr.msg_controllen); print_mask_arg(sysdecode_msg_flags, fp, msghdr.msg_flags); fputs("}", fp); break; } case CloudABIAdvice: fputs(xlookup(cloudabi_advice, args[sc->offset]), fp); break; case CloudABIClockID: fputs(xlookup(cloudabi_clockid, args[sc->offset]), fp); break; case CloudABIFDSFlags: fputs(xlookup_bits(cloudabi_fdsflags, args[sc->offset]), fp); break; case CloudABIFDStat: { cloudabi_fdstat_t fds; if (get_struct(pid, args[sc->offset], &fds, sizeof(fds)) != -1) { fprintf(fp, "{ %s, ", xlookup(cloudabi_filetype, fds.fs_filetype)); fprintf(fp, "%s, ... }", xlookup_bits(cloudabi_fdflags, fds.fs_flags)); } else print_pointer(fp, args[sc->offset]); break; } case CloudABIFileStat: { cloudabi_filestat_t fsb; if (get_struct(pid, args[sc->offset], &fsb, sizeof(fsb)) != -1) fprintf(fp, "{ %s, %ju }", xlookup(cloudabi_filetype, fsb.st_filetype), (uintmax_t)fsb.st_size); else print_pointer(fp, args[sc->offset]); break; } case CloudABIFileType: fputs(xlookup(cloudabi_filetype, args[sc->offset]), fp); break; case CloudABIFSFlags: fputs(xlookup_bits(cloudabi_fsflags, args[sc->offset]), fp); break; case CloudABILookup: if ((args[sc->offset] & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) != 0) fprintf(fp, "%d|LOOKUP_SYMLINK_FOLLOW", (int)args[sc->offset]); else fprintf(fp, "%d", (int)args[sc->offset]); break; case CloudABIMFlags: fputs(xlookup_bits(cloudabi_mflags, args[sc->offset]), fp); break; case CloudABIMProt: fputs(xlookup_bits(cloudabi_mprot, args[sc->offset]), fp); break; case CloudABIMSFlags: fputs(xlookup_bits(cloudabi_msflags, args[sc->offset]), fp); break; case CloudABIOFlags: fputs(xlookup_bits(cloudabi_oflags, args[sc->offset]), fp); break; case CloudABISDFlags: fputs(xlookup_bits(cloudabi_sdflags, args[sc->offset]), fp); break; case CloudABISignal: fputs(xlookup(cloudabi_signal, args[sc->offset]), fp); break; case CloudABITimestamp: fprintf(fp, "%lu.%09lus", args[sc->offset] / 1000000000, args[sc->offset] % 1000000000); break; case CloudABIULFlags: fputs(xlookup_bits(cloudabi_ulflags, args[sc->offset]), fp); break; case CloudABIWhence: fputs(xlookup(cloudabi_whence, args[sc->offset]), fp); break; default: errx(1, "Invalid argument type %d\n", sc->type & ARG_MASK); } fclose(fp); return (tmp); } /* * Print (to outfile) the system call and its arguments. 
*/ void print_syscall(struct trussinfo *trussinfo) { struct threadinfo *t; const char *name; char **s_args; int i, len, nargs; t = trussinfo->curthread; name = t->cs.sc->name; nargs = t->cs.nargs; s_args = t->cs.s_args; len = print_line_prefix(trussinfo); len += fprintf(trussinfo->outfile, "%s(", name); for (i = 0; i < nargs; i++) { if (s_args[i] != NULL) len += fprintf(trussinfo->outfile, "%s", s_args[i]); else len += fprintf(trussinfo->outfile, "<missing argument>"); len += fprintf(trussinfo->outfile, "%s", i < (nargs - 1) ? "," : ""); } len += fprintf(trussinfo->outfile, ")"); for (i = 0; i < 6 - (len / 8); i++) fprintf(trussinfo->outfile, "\t"); } void print_syscall_ret(struct trussinfo *trussinfo, int error, register_t *retval) { struct timespec timediff; struct threadinfo *t; struct syscall *sc; t = trussinfo->curthread; sc = t->cs.sc; if (trussinfo->flags & COUNTONLY) { timespecsub(&t->after, &t->before, &timediff); timespecadd(&sc->time, &timediff, &sc->time); sc->ncalls++; if (error != 0) sc->nerror++; return; } print_syscall(trussinfo); fflush(trussinfo->outfile); if (retval == NULL) { /* * This system call resulted in the current thread's exit, * so there is no return value or error to display. */ fprintf(trussinfo->outfile, "\n"); return; } if (error == ERESTART) fprintf(trussinfo->outfile, " ERESTART\n"); else if (error == EJUSTRETURN) fprintf(trussinfo->outfile, " EJUSTRETURN\n"); else if (error != 0) { fprintf(trussinfo->outfile, " ERR#%d '%s'\n", sysdecode_freebsd_to_abi_errno(t->proc->abi->abi, error), strerror(error)); } #ifndef __LP64__ else if (sc->ret_type == 2) { off_t off; #if _BYTE_ORDER == _LITTLE_ENDIAN off = (off_t)retval[1] << 32 | retval[0]; #else off = (off_t)retval[0] << 32 | retval[1]; #endif fprintf(trussinfo->outfile, " = %jd (0x%jx)\n", (intmax_t)off, (intmax_t)off); } #endif else fprintf(trussinfo->outfile, " = %jd (0x%jx)\n", (intmax_t)retval[0], (intmax_t)retval[0]); } void print_summary(struct trussinfo *trussinfo) { struct timespec total = {0, 0}; struct syscall *sc; int ncall, nerror; fprintf(trussinfo->outfile, "%-20s%15s%8s%8s\n", "syscall", "seconds", "calls", "errors"); ncall = nerror = 0; STAILQ_FOREACH(sc, &syscalls, entries) if (sc->ncalls) { fprintf(trussinfo->outfile, "%-20s%5jd.%09ld%8d%8d\n", sc->name, (intmax_t)sc->time.tv_sec, sc->time.tv_nsec, sc->ncalls, sc->nerror); timespecadd(&total, &sc->time, &total); ncall += sc->ncalls; nerror += sc->nerror; } fprintf(trussinfo->outfile, "%20s%15s%8s%8s\n", "", "-------------", "-------", "-------"); fprintf(trussinfo->outfile, "%-20s%5jd.%09ld%8d%8d\n", "", (intmax_t)total.tv_sec, total.tv_nsec, ncall, nerror); } Index: head/usr.sbin/bsdconfig/share/media/network.subr =================================================================== --- head/usr.sbin/bsdconfig/share/media/network.subr (revision 357663) +++ head/usr.sbin/bsdconfig/share/media/network.subr (revision 357664) @@ -1,182 +1,182 @@ if [ ! "$_MEDIA_NETWORK_SUBR" ]; then _MEDIA_NETWORK_SUBR=1 # # Copyright (c) 2012-2013 Devin Teske # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution.
# # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # ############################################################ INCLUDES BSDCFG_SHARE="/usr/share/bsdconfig" . $BSDCFG_SHARE/common.subr || exit 1 f_dprintf "%s: loading includes..." media/network.subr f_include $BSDCFG_SHARE/dialog.subr f_include $BSDCFG_SHARE/media/tcpip.subr BSDCFG_LIBE="/usr/libexec/bsdconfig" f_include_lang $BSDCFG_LIBE/include/messages.subr ############################################################ GLOBALS NETWORK_INITIALIZED= ############################################################ FUNCTIONS # f_media_init_network $device # # Initialize a network device (such as `fxp0', `em0', etc.). Returns success if # the device is successfully initialized. If not running as init (basically, # from the FreeBSD install media), the network is assumed to have already been # initialized and success is returned. # # The variables (from variable.subr) used to initialize the network are as -# follows (all of which are configured either automatically or manaully): +# follows (all of which are configured either automatically or manually): # # VAR_IFCONFIG + device_name (e.g., `ifconfig_em0') # Automatically populated but can be overridden in a script. This # defines the ifconfig(8) properties specific to a chosen network # interface device. Optional if VAR_IPV6ADDR is set. # VAR_IPV6ADDR [Optional] # If not running as init (and setting up RTSOL connections for # the interface), then it must be set manually. If set, used as the # IPv6 configuration for the given network interface device. # VAR_GATEWAY [Optional] # If not running as init (and setting up a static connection for # the interface) then it must be set (usually via rc.conf(5), but # it can be set manually to override). If unset, the user is warned # but not prevented from proceeding (as most configurations need a # default route, but not all do). # f_media_init_network() { local dev="$1" f_dprintf "Init routine called for network device \`%s'." "$dev" if [ "$NETWORK_INITIALIZED" ]; then f_dprintf "Network already initialized." return $SUCCESS elif ! f_running_as_init; then f_dprintf "Not running as init -- calling the deed done." NETWORK_INITIALIZED=1 return $SUCCESS fi if [ ! -e "$RESOLV_CONF" ]; then if ! f_config_resolv; then f_show_msg "$msg_cant_seem_to_write_out_resolv_conf" \ "$RESOLV_CONF" return $FAILURE fi fi local cp if f_getvar $VAR_IFCONFIG$dev cp; then # # If this interface isn't a DHCP one, bring it up. # If it is, then it's already up. # case "$cp" in *DHCP*) f_dprintf "A DHCP interface. Should already be up." ;; *) f_dprintf "Not a DHCP interface." if ! f_quietly ifconfig "$dev" $cp; then f_show_msg "$msg_unable_to_configure_device" \ "$dev" return $FAILURE fi local rp f_getvar $VAR_GATEWAY rp if [ !
"$rp" ]; then f_show_msg "$msg_no_gateway_has_been_set" else # # Explicitly flush all routes to get back to a # known sane state. We don't need to check this # exit code because if anything fails it will # show up in the route add below. # f_quietly route -n flush f_dprintf "Adding default route to %s." "$rp" if ! f_quietly route -n add default "$rp"; then f_show_msg \ "$msg_failed_to_add_default_route" return $FAILURE fi fi esac elif ! { f_getvar $VAR_IPV6ADDR cp && [ "$cp" ]; }; then f_show_msg "$msg_device_is_not_configured" "$dev" return $FAILURE fi f_dprintf "Network initialized successfully." NETWORK_INITIALIZED=1 return $SUCCESS } # f_media_shutdown_network $device # # Shuts down the configured network device (e.g., `fxp0', `em0', etc.) and # deletes the default route (if configured). Returns failure if the device # passed has not been configured. If not running as init (basically from the # FreeBSD install media) then does nothing and returns success. # f_media_shutdown_network() { local dev="$1" cp f_dprintf "Shutdown called for network device %s" "$dev" if [ ! "$NETWORK_INITIALIZED" ]; then f_dprintf "Network not initialized -- nothing to do." return $SUCCESS fi unset NETWORK_INITIALIZED unset $VAR_NETWORK_DEVICE if ! f_running_as_init; then f_dprintf "Not running as init -- calling the deed done." return $SUCCESS fi f_getvar $VAR_IFCONFIG$dev cp || return $FAILURE f_dprintf "ifconfig %s down" "$dev" f_quietly ifconfig $dev down || f_show_msg "$msg_unable_to_down_the_interface_properly" "$dev" if f_getvar $VAR_GATEWAY cp; then f_dprintf "Deleting default route." f_quietly route -n delete default fi return $SUCCESS } ############################################################ MAIN f_dprintf "%s: Successfully loaded." media/network.subr fi # ! $_MEDIA_NETWORK_SUBR