Index: sys/dev/ixl/if_ixl.c
===================================================================
--- sys/dev/ixl/if_ixl.c
+++ sys/dev/ixl/if_ixl.c
@@ -118,6 +118,8 @@
 static void	ixl_if_timer(if_ctx_t ctx, uint16_t qid);
 static void	ixl_if_vlan_register(if_ctx_t ctx, u16 vtag);
 static void	ixl_if_vlan_unregister(if_ctx_t ctx, u16 vtag);
+static void	ixl_if_vxlan_register(if_ctx_t ctx, u16 port);
+static void	ixl_if_vxlan_unregister(if_ctx_t ctx, u16 port);
 static uint64_t	ixl_if_get_counter(if_ctx_t ctx, ift_counter cnt);
 static int	ixl_if_i2c_req(if_ctx_t ctx, struct ifi2creq *req);
 static int	ixl_if_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data);
@@ -190,6 +192,8 @@
 	DEVMETHOD(ifdi_timer, ixl_if_timer),
 	DEVMETHOD(ifdi_vlan_register, ixl_if_vlan_register),
 	DEVMETHOD(ifdi_vlan_unregister, ixl_if_vlan_unregister),
+	DEVMETHOD(ifdi_vxlan_register, ixl_if_vxlan_register),
+	DEVMETHOD(ifdi_vxlan_unregister, ixl_if_vxlan_unregister),
 	DEVMETHOD(ifdi_get_counter, ixl_if_get_counter),
 	DEVMETHOD(ifdi_i2c_req, ixl_if_i2c_req),
 	DEVMETHOD(ifdi_priv_ioctl, ixl_if_priv_ioctl),
@@ -726,6 +730,9 @@
 	/* Add protocol filters to list */
 	ixl_init_filters(vsi);
 
+	/* Initialize the udp_ports bitmap for VXLAN offloads */
+	memset(&pf->udp_ports, 0, sizeof(pf->udp_ports));
+
 	/* Init queue allocation manager */
 	error = ixl_pf_qmgr_init(&pf->qmgr, hw->func_caps.num_tx_qp);
 	if (error) {
@@ -996,6 +1003,9 @@
 	/* Re-add configure filters to HW */
 	ixl_reconfigure_filters(vsi);
 
+	/* Sync all UDP filters */
+	ixl_sync_udp_filters(pf, true);
+
 	/* Configure promiscuous mode */
 	ixl_if_promisc_set(ctx, if_getflags(ifp));
 
@@ -1424,6 +1434,9 @@
 	ixl_process_adminq(pf, &pending);
 	ixl_update_link_status(pf);
 
+	if (IXL_PF_HAS_PENDING_UDP_FILTER_SYNC(pf))
+		ixl_sync_udp_filters(pf, false);
+
 	/*
 	 * If there are still messages to process, reschedule ourselves.
 	 * Otherwise, re-enable our interrupt and go to sleep.
@@ -1729,6 +1742,73 @@
 	}
 }
 
+static void
+ixl_if_vxlan_register(if_ctx_t ctx, u16 port)
+{
+	struct ixl_pf *pf = iflib_get_softc(ctx);
+	int idx;
+
+	/* Check if the port is already offloaded */
+	idx = ixl_get_udp_port_idx(pf, port);
+	if (idx != -1) {
+		device_printf(pf->dev, "port %d already offloaded\n", port);
+		return;
+	}
+
+	/* Now check if there is space to add the new port */
+	idx = ixl_get_udp_port_idx(pf, 0);
+	if (idx == -1) {
+		device_printf(pf->dev,
+		    "maximum number of offloaded UDP ports reached, not adding port %d\n",
+		    port);
+		return;
+	}
+
+	pf->udp_ports[idx].port = port;
+	pf->udp_ports[idx].filter_index = IXL_UDP_PORT_INDEX_UNUSED;
+	pf->udp_ports[idx].is_marked_for_deletion = FALSE;
+	pf->pending_udp_bitmap |= BIT_ULL(idx);
+
+	atomic_set_32(&pf->state, IXL_PF_STATE_UDP_FILTER_SYNC_PENDING);
+
+	if (if_getdrvflags(iflib_get_ifp(ctx)) & IFF_DRV_RUNNING)
+		iflib_admin_intr_deferred(ctx);
+}
+
+static void
+ixl_if_vxlan_unregister(if_ctx_t ctx, u16 port)
+{
+	struct ixl_pf *pf = iflib_get_softc(ctx);
+	int idx;
+
+	/* Check that the port is currently offloaded */
+	idx = ixl_get_udp_port_idx(pf, port);
+	if (idx == -1) {
+		device_printf(pf->dev,
+		    "UDP port %d was not found, not deleting\n", port);
+		return;
+	}
+
+	/* If the port exists, set its value to 0.  When ixl_if_vxlan_register
+	 * looks for an empty entry for a new tunnel, it looks for entries
+	 * with port set to 0.  Also mark the current entry for deletion and
+	 * make the deletion pending.
+	 */
+	pf->udp_ports[idx].port = 0;
+	pf->udp_ports[idx].is_marked_for_deletion = TRUE;
+
+	/* Toggle the pending bit instead of setting it.  This way, if we are
+	 * deleting a port that has yet to be added, we just clear the
+	 * pending bit and don't have to worry about it.
+	 */
+	pf->pending_udp_bitmap ^= BIT_ULL(idx);
+
+	atomic_set_32(&pf->state, IXL_PF_STATE_UDP_FILTER_SYNC_PENDING);
+
+	if (if_getdrvflags(iflib_get_ifp(ctx)) & IFF_DRV_RUNNING)
+		iflib_admin_intr_deferred(ctx);
+}
+
 static uint64_t
 ixl_if_get_counter(if_ctx_t ctx, ift_counter cnt)
 {
@@ -1839,6 +1919,7 @@
 {
 	switch (event) {
 	case IFLIB_RESTART_VLAN_CONFIG:
+	case IFLIB_RESTART_VXLAN_CONFIG:
 	default:
 		return (false);
 	}
Index: sys/dev/ixl/ixl.h
===================================================================
--- sys/dev/ixl/ixl.h
+++ sys/dev/ixl/ixl.h
@@ -199,7 +199,12 @@
 #define CSUM_OFFLOAD_IPV4	(CSUM_IP|CSUM_TCP|CSUM_UDP|CSUM_SCTP)
 #define CSUM_OFFLOAD_IPV6	(CSUM_TCP_IPV6|CSUM_UDP_IPV6|CSUM_SCTP_IPV6)
-#define CSUM_OFFLOAD		(CSUM_OFFLOAD_IPV4|CSUM_OFFLOAD_IPV6|CSUM_TSO)
+#define CSUM_INNER_IPV4		(CSUM_INNER_IP|CSUM_INNER_IP_TCP|CSUM_INNER_IP_UDP)
+#define CSUM_INNER_IPV6		(CSUM_INNER_IP6_TCP|CSUM_INNER_IP6_UDP)
+#define CSUM_OFFLOAD		(CSUM_OFFLOAD_IPV4|CSUM_OFFLOAD_IPV6|CSUM_TSO| \
+				 CSUM_ENCAP_VXLAN|CSUM_INNER_IPV4|CSUM_INNER_IPV6| \
+				 CSUM_INNER_IP_TSO|CSUM_INNER_IP6_TSO)
+#define IXL_NEEDS_CTXD(flags)	((flags) & (CSUM_TSO | CSUM_ENCAP_VXLAN))
 
 /* Misc flags for ixl_vsi.flags */
 #define IXL_FLAGS_KEEP_TSO4	(1 << 0)
@@ -261,7 +266,8 @@
 	 IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | \
 	 IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO | \
 	 IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM | \
-	 IFCAP_VLAN_MTU | IFCAP_JUMBO_MTU | IFCAP_LRO)
+	 IFCAP_VLAN_MTU | IFCAP_JUMBO_MTU | IFCAP_LRO | \
+	 IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO)
 
 #define IXL_CSUM_TCP \
     (CSUM_IP_TCP|CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP6_TCP)
@@ -271,6 +277,9 @@
     (CSUM_IP_SCTP|CSUM_IP6_SCTP)
 #define IXL_CSUM_IPV4 \
     (CSUM_IP|CSUM_IP_TSO)
+#define IXL_CSUM_TSO \
+    (CSUM_IP_TSO | CSUM_IP6_TSO | \
+     CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO)
 
 /* Pre-11 counter(9) compatibility */
 #if __FreeBSD_version >= 1100036
@@ -404,6 +413,7 @@
 	/* Stats */
 	u64			irqs;
 	u64			tso;
+	u64			tx_vxlan;
 };
 
 struct ixl_rx_queue {
Index: sys/dev/ixl/ixl_pf.h
===================================================================
--- sys/dev/ixl/ixl_pf.h
+++ sys/dev/ixl/ixl_pf.h
@@ -89,6 +89,7 @@
 	IXL_PF_STATE_FW_LLDP_DISABLED	= (1 << 9),
 	IXL_PF_STATE_EEE_ENABLED	= (1 << 10),
 	IXL_PF_STATE_LINK_ACTIVE_ON_DOWN = (1 << 11),
+	IXL_PF_STATE_UDP_FILTER_SYNC_PENDING = (1 << 12),
 };
 
 #define IXL_PF_IN_RECOVERY_MODE(pf) \
@@ -97,6 +98,19 @@
 #define IXL_PF_IS_RESETTING(pf) \
 	((atomic_load_acq_32(&pf->state) & IXL_PF_STATE_RESETTING) != 0)
 
+#define IXL_PF_HAS_PENDING_UDP_FILTER_SYNC(pf) \
+	((atomic_load_acq_32(&pf->state) & \
+	  IXL_PF_STATE_UDP_FILTER_SYNC_PENDING) != 0)
+
+#define IXL_UDP_PORT_INDEX_UNUSED	255
+struct ixl_udp_port_config {
+	/* The AdminQ command interface expects the port in host byte order */
+	u16	port;
+	u8	filter_index;
+	bool	is_marked_for_deletion;
+};
+
 struct ixl_vf {
 	struct ixl_vsi		vsi;
 	u32			vf_flags;
@@ -172,6 +186,10 @@
 	int			num_vfs;
 	uint16_t		veb_seid;
 	int			vc_debug_lvl;
+
+	/* VXLAN */
+	struct ixl_udp_port_config udp_ports[I40E_MAX_PF_UDP_OFFLOAD_PORTS];
+	u16			pending_udp_bitmap;
 };
 
 /*
@@ -438,4 +456,7 @@
 int	ixl_attach_get_link_status(struct ixl_pf *);
 int	ixl_sysctl_set_flowcntl(SYSCTL_HANDLER_ARGS);
 
+int	ixl_get_udp_port_idx(struct ixl_pf *, u16);
+void	ixl_sync_udp_filters(struct ixl_pf *, bool all);
+
 #endif /* _IXL_PF_H_ */
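Note (illustration only, not part of the patch): the register/unregister paths
above deliberately differ in how they touch pf->pending_udp_bitmap.
ixl_if_vxlan_register() ORs the slot's pending bit in, while
ixl_if_vxlan_unregister() XORs it, so deleting a port whose addition was never
synced simply cancels the pending work instead of queueing a bogus deletion.
A minimal userland sketch of that invariant (NPORTS stands in for
I40E_MAX_PF_UDP_OFFLOAD_PORTS; everything else is plain C99):

/* pending_toggle.c -- sketch of the OR/XOR pending-bit discipline. */
#include <stdint.h>
#include <stdio.h>

#define NPORTS		16
#define BIT_ULL(i)	(1ULL << (i))

static uint16_t ports[NPORTS];	/* stands in for pf->udp_ports[i].port */
static uint64_t pending;	/* stands in for pf->pending_udp_bitmap */

static void
add_port(int idx, uint16_t port)
{
	ports[idx] = port;
	pending |= BIT_ULL(idx);	/* register: set the pending bit */
}

static void
del_port(int idx)
{
	ports[idx] = 0;
	pending ^= BIT_ULL(idx);	/* unregister: toggle the pending bit */
}

int
main(void)
{
	/* Delete after the sync task consumed the add: bit must be set again. */
	add_port(0, 4789);
	pending &= ~BIT_ULL(0);		/* ixl_sync_udp_filters() ran */
	del_port(0);
	printf("delete after sync:  pending=%d (expect 1)\n",
	    (pending & BIT_ULL(0)) != 0);

	/* Delete before the add was synced: XOR cancels the pending add. */
	add_port(1, 6081);
	del_port(1);
	printf("delete before sync: pending=%d (expect 0)\n",
	    (pending & BIT_ULL(1)) != 0);
	return (0);
}

Running it prints a set bit for the delete-after-sync case and a clear bit for
the delete-before-sync case, which is exactly the state ixl_sync_udp_filters()
below relies on.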
Index: sys/dev/ixl/ixl_pf_main.c
===================================================================
--- sys/dev/ixl/ixl_pf_main.c
+++ sys/dev/ixl/ixl_pf_main.c
@@ -1031,6 +1031,81 @@
 	ixl_set_rss_hlut(pf);
 }
 
+int
+ixl_get_udp_port_idx(struct ixl_pf *pf, u16 port)
+{
+	u8 i;
+
+	for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; ++i) {
+		/* Do not report ports with pending deletions as
+		 * being available.
+		 */
+		if (!port && (pf->pending_udp_bitmap & BIT_ULL(i)))
+			continue;
+		if (pf->udp_ports[i].is_marked_for_deletion)
+			continue;
+		if (pf->udp_ports[i].port == port)
+			return (i);
+	}
+
+	return (-1);
+}
+
+void
+ixl_sync_udp_filters(struct ixl_pf *pf, bool all)
+{
+	struct ixl_udp_port_config *udp_port;
+	struct i40e_hw *hw = &pf->hw;
+	enum i40e_status_code status;
+	bool is_marked_for_deletion;
+	u8 i, filter_index;
+	u16 port;
+
+	for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; ++i) {
+		if (all || (pf->pending_udp_bitmap & BIT_ULL(i))) {
+			status = I40E_SUCCESS;
+
+			udp_port = &pf->udp_ports[i];
+			pf->pending_udp_bitmap &= ~BIT_ULL(i);
+
+			port = udp_port->port;
+			is_marked_for_deletion = udp_port->is_marked_for_deletion;
+			filter_index = udp_port->filter_index;
+
+			if (!is_marked_for_deletion && port > 0) {
+				status = i40e_aq_add_udp_tunnel(hw, port,
+				    I40E_AQC_TUNNEL_TYPE_VXLAN, &filter_index, NULL);
+			} else if (filter_index != IXL_UDP_PORT_INDEX_UNUSED)
+				status = i40e_aq_del_udp_tunnel(hw, filter_index, NULL);
+
+			if (status) {
+				device_printf(pf->dev,
+				    "VXLAN %s port %d, index %d failed, err %s aq_err %s\n",
+				    is_marked_for_deletion ? "delete" : "add",
+				    port, filter_index, i40e_stat_str(&pf->hw, status),
+				    i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status));
+
+				if (!is_marked_for_deletion) {
+					/* The addition failed; just reset the
+					 * port and drop the pending bit for
+					 * any deletion.
+					 */
+					udp_port->port = 0;
+					pf->pending_udp_bitmap &= ~BIT_ULL(i);
+				}
+			} else {
+				if (!is_marked_for_deletion) {
+					/* Record the filter index on success */
+					udp_port->filter_index = filter_index;
+				}
+				/* Clear the deletion flag */
+				udp_port->is_marked_for_deletion = FALSE;
+			}
+		}
+	}
+
+	atomic_clear_32(&pf->state, IXL_PF_STATE_UDP_FILTER_SYNC_PENDING);
+}
+
 /*
  * In some firmware versions there is default MAC/VLAN filter
  * configured which interferes with filters managed by driver.
Index: sys/dev/ixl/ixl_txrx.c
===================================================================
--- sys/dev/ixl/ixl_txrx.c
+++ sys/dev/ixl/ixl_txrx.c
@@ -272,13 +272,18 @@
 	}
 }
 
-/**********************************************************************
- *
- * Setup context for hardware segmentation offload (TSO)
- *
- **********************************************************************/
+#if defined(INET6) || defined(INET)
+/**
+ * ixl_ctxd_setup - Set up a context descriptor for TSO or VXLAN offload
+ * @txr: TX ring that handles the transmission
+ * @pi: packet info extracted from the packet headers
+ *
+ * Configure a TX context descriptor with the information, extracted from
+ * the packet headers, that the hardware needs to calculate the requested
+ * checksums and perform TCP segmentation.
+ */ static int -ixl_tso_setup(struct tx_ring *txr, if_pkt_info_t pi) +ixl_ctxd_setup(struct tx_ring *txr, if_pkt_info_t pi) { if_softc_ctx_t scctx; struct i40e_tx_context_desc *TXD; @@ -288,43 +293,74 @@ idx = pi->ipi_pidx; TXD = (struct i40e_tx_context_desc *) &txr->tx_base[idx]; - total_hdr_len = pi->ipi_ehdrlen + pi->ipi_ip_hlen + pi->ipi_tcp_hlen; - tsolen = pi->ipi_len - total_hdr_len; scctx = txr->que->vsi->shared; + type_cmd_tso_mss = + (u64)I40E_TX_DESC_DTYPE_CONTEXT << I40E_TXD_CTX_QW1_DTYPE_SHIFT; + type = I40E_TX_DESC_DTYPE_CONTEXT; - cmd = I40E_TX_CTX_DESC_TSO; - /* - * TSO MSS must not be less than 64; this prevents a - * BAD_LSO_MSS MDD event when the MSS is too small. - */ - if (pi->ipi_tso_segsz < IXL_MIN_TSO_MSS) { - txr->mss_too_small++; - pi->ipi_tso_segsz = IXL_MIN_TSO_MSS; + if (pi->ipi_csum_flags & IXL_CSUM_TSO) { + cmd = I40E_TX_CTX_DESC_TSO; + /* + * TSO MSS must not be less than 64; this prevents a + * BAD_LSO_MSS MDD event when the MSS is too small. + */ + if (pi->ipi_tso_segsz < IXL_MIN_TSO_MSS) { + txr->mss_too_small++; + pi->ipi_tso_segsz = IXL_MIN_TSO_MSS; + } + mss = pi->ipi_tso_segsz; + + total_hdr_len = pi->ipi_ehdrlen + + pi->ipi_ip_hlen + pi->ipi_tcp_hlen + + pi->ipi_outer_ip_hlen + pi->ipi_tun_hlen; + tsolen = pi->ipi_len - total_hdr_len; + + /* Check for BAD_LS0_MSS MDD event (mss too large) */ + MPASS(mss <= IXL_MAX_TSO_MSS); + /* Check for NO_HEAD MDD event (header lengths are 0) */ + MPASS(pi->ipi_ehdrlen != 0); + MPASS(pi->ipi_ip_hlen != 0); + /* Partial check for BAD_LSO_LEN MDD event */ + MPASS(tsolen != 0); + /* Partial check for WRONG_SIZE MDD event (during TSO) */ + MPASS(total_hdr_len + mss <= IXL_MAX_FRAME); + + type_cmd_tso_mss |= + ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) | + ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | + ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT); + txr->que->tso++; } - mss = pi->ipi_tso_segsz; - - /* Check for BAD_LS0_MSS MDD event (mss too large) */ - MPASS(mss <= IXL_MAX_TSO_MSS); - /* Check for NO_HEAD MDD event (header lengths are 0) */ - MPASS(pi->ipi_ehdrlen != 0); - MPASS(pi->ipi_ip_hlen != 0); - /* Partial check for BAD_LSO_LEN MDD event */ - MPASS(tsolen != 0); - /* Partial check for WRONG_SIZE MDD event (during TSO) */ - MPASS(total_hdr_len + mss <= IXL_MAX_FRAME); - - type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) | - ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) | - ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | - ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT); TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss); - TXD->tunneling_params = htole32(0); - txr->que->tso++; + if (pi->ipi_csum_flags & CSUM_ENCAP_VXLAN) { + u32 tun_params = I40E_TXD_CTX_UDP_TUNNELING; + switch (pi->ipi_outer_etype) { + case ETHERTYPE_IP: + if (pi->ipi_csum_flags & CSUM_INNER_IP_TSO) + tun_params |= I40E_TX_CTX_EXT_IP_IPV4; + else + tun_params |= I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM; + break; + case ETHERTYPE_IPV6: + tun_params |= I40E_TX_CTX_EXT_IP_IPV6; + break; + default: + break; + } + tun_params |= + (pi->ipi_outer_ip_hlen >> 2) << I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT | + (pi->ipi_tun_hlen >> 1) << I40E_TXD_CTX_QW0_NATLEN_SHIFT; + TXD->tunneling_params = htole32(tun_params); + txr->que->tx_vxlan++; + } else + TXD->tunneling_params = htole32(0); + return ((idx + 1) & (scctx->isc_ntxd[0]-1)); } +#endif /********************************************************************* * @@ -355,12 +391,12 @@ /* Set up the TSO/CSUM offload */ if (pi->ipi_csum_flags & CSUM_OFFLOAD) { - /* Set up the TSO context descriptor if required */ - if 
-		if (pi->ipi_csum_flags & CSUM_TSO) {
+		/* Set up the context descriptor if required */
+		if (IXL_NEEDS_CTXD(pi->ipi_csum_flags)) {
 			/* Prevent MAX_BUFF MDD event (for TSO) */
 			if (ixl_tso_detect_sparse(segs, nsegs, pi))
 				return (EFBIG);
-			i = ixl_tso_setup(txr, pi);
+			i = ixl_ctxd_setup(txr, pi);
 		}
 		ixl_tx_setup_offload(que, pi, &cmd, &off);
 	}
@@ -730,13 +766,17 @@
 	return (0);
 }
 
-/*********************************************************************
- *
- * Verify that the hardware indicated that the checksum is valid.
- * Inform the stack about the status of checksum so that stack
- * doesn't spend time verifying the checksum.
- *
- *********************************************************************/
+/**
+ * ixl_rx_checksum - Report to the stack whether the hardware validated checksums
+ * @ri: iflib RXD info
+ * @status: RX descriptor status data
+ * @error: RX descriptor error data
+ * @ptype: packet type
+ *
+ * Determine whether the hardware verified the RX checksums and whether they
+ * are valid, and report the result to the stack so that it does not spend
+ * time verifying them again.
+ */
 static u8
 ixl_rx_checksum(if_rxd_info_t ri, u32 status, u32 error, u8 ptype)
 {
@@ -750,6 +790,10 @@
 
 	decoded = decode_rx_desc_ptype(ptype);
 
+	/* Cannot proceed if the packet type is unknown or not an IP packet */
+	if (decoded.known == 0 || decoded.outer_ip != I40E_RX_PTYPE_OUTER_IP)
+		return (0);
+
 	/* IPv6 with extension headers likely have bad csum */
 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
 	    decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) {
@@ -760,21 +804,68 @@
 		}
 	}
 
-	ri->iri_csum_flags |= CSUM_L3_CALC;
-
-	/* IPv4 checksum error */
-	if (error & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT))
-		return (1);
-
-	ri->iri_csum_flags |= CSUM_L3_VALID;
-	ri->iri_csum_flags |= CSUM_L4_CALC;
-
-	/* L4 checksum error */
-	if (error & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))
-		return (1);
-
-	ri->iri_csum_flags |= CSUM_L4_VALID;
-	ri->iri_csum_data |= htons(0xffff);
+	switch (decoded.tunnel_type) {
+	case I40E_RX_PTYPE_TUNNEL_NONE:
+		/* The L3 checksum is calculated only for IPv4 packets */
+		if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
+			ri->iri_csum_flags |= CSUM_L3_CALC;
+			/* IPv4 checksum error */
+			if (error & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT))
+				return (1);
+			ri->iri_csum_flags |= CSUM_L3_VALID;
+		}
+		switch (decoded.inner_prot) {
+		case I40E_RX_PTYPE_INNER_PROT_UDP:
+		case I40E_RX_PTYPE_INNER_PROT_TCP:
+		case I40E_RX_PTYPE_INNER_PROT_SCTP:
+			ri->iri_csum_flags |= CSUM_L4_CALC;
+			/* L4 checksum error */
+			if (error & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))
+				return (1);
+			ri->iri_csum_flags |= CSUM_L4_VALID;
+			ri->iri_csum_data |= htons(0xffff);
+			break;
+		default:
+			break;
+		}
+		break;
+	case I40E_RX_PTYPE_TUNNEL_IP_GRENAT:
+	case I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC:
+	case I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN:
+		/* L3 checksum of the outer IPv4 header */
+		if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
+			ri->iri_csum_flags |= CSUM_L3_CALC;
+			/* Outer IP checksum error */
+			if (error & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT))
+				return (1);
+			ri->iri_csum_flags |= CSUM_L3_VALID;
+		}
+		/* L3 checksum of the innermost IPv4 header */
+		if (decoded.tunnel_end_prot == I40E_RX_PTYPE_TUNNEL_END_IPV4) {
+			ri->iri_csum_flags |= CSUM_INNER_L3_CALC;
+			/* Inner IP checksum error */
+			if (error & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT))
+				return (1);
+			ri->iri_csum_flags |= CSUM_INNER_L3_VALID;
+		}
+		switch (decoded.inner_prot) {
+		case I40E_RX_PTYPE_INNER_PROT_UDP:
+		case I40E_RX_PTYPE_INNER_PROT_TCP:
+		case I40E_RX_PTYPE_INNER_PROT_SCTP:
+			ri->iri_csum_flags |= CSUM_INNER_L4_CALC;
+			if (error & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))
+				return (1);
+			ri->iri_csum_flags |= CSUM_INNER_L4_VALID;
+			ri->iri_csum_data |= htons(0xffff);
+			break;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
 
 	return (0);
 }
@@ -950,6 +1041,9 @@
 		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tso",
 		    CTLFLAG_RD, &(tx_que->tso),
 		    "TSO");
+		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_vxlan",
+		    CTLFLAG_RD, &(tx_que->tx_vxlan),
+		    "TX packets sent using VXLAN HW offloads");
 		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "mss_too_small",
 		    CTLFLAG_RD, &(txr->mss_too_small),
 		    "TSO sends with an MSS less than 64");
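Note (illustration only, not part of the patch): the TSO length programmed by
ixl_ctxd_setup() must now exclude the outer headers as well, and the result is
packed into QW1 of the context descriptor together with the command and MSS
fields. A standalone worked example of that arithmetic; the shift constants
mirror the i40e shared code, and the header sizes assume an untagged
IPv4/UDP/VXLAN outer stack with no IP options:

/* ctxd_math.c -- worked example of the VXLAN TSO header accounting. */
#include <stdint.h>
#include <stdio.h>

#define I40E_TX_DESC_DTYPE_CONTEXT	0x1ULL
#define I40E_TX_CTX_DESC_TSO		0x1ULL
#define I40E_TXD_CTX_QW1_DTYPE_SHIFT	0
#define I40E_TXD_CTX_QW1_CMD_SHIFT	4
#define I40E_TXD_CTX_QW1_TSO_LEN_SHIFT	30
#define I40E_TXD_CTX_QW1_MSS_SHIFT	50

int
main(void)
{
	uint32_t ehdrlen = 14;		/* ipi_ehdrlen: outer Ethernet */
	uint32_t outer_ip_hlen = 20;	/* ipi_outer_ip_hlen: outer IPv4 */
	uint32_t tun_hlen = 8 + 8 + 14;	/* ipi_tun_hlen: UDP + VXLAN + inner Ethernet */
	uint32_t ip_hlen = 20;		/* ipi_ip_hlen: inner IPv4 */
	uint32_t tcp_hlen = 20;		/* ipi_tcp_hlen: inner TCP */
	uint32_t pkt_len = 65000;	/* ipi_len */
	uint64_t mss = 1400;		/* ipi_tso_segsz */

	uint32_t total_hdr_len = ehdrlen + ip_hlen + tcp_hlen +
	    outer_ip_hlen + tun_hlen;	/* 104 bytes of headers */
	uint64_t tsolen = pkt_len - total_hdr_len;

	uint64_t qw1 = (I40E_TX_DESC_DTYPE_CONTEXT << I40E_TXD_CTX_QW1_DTYPE_SHIFT) |
	    (I40E_TX_CTX_DESC_TSO << I40E_TXD_CTX_QW1_CMD_SHIFT) |
	    (tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
	    (mss << I40E_TXD_CTX_QW1_MSS_SHIFT);

	printf("total_hdr_len=%u tsolen=%llu qw1=%#018llx\n", total_hdr_len,
	    (unsigned long long)tsolen, (unsigned long long)qw1);
	return (0);
}

With these inputs the headers add up to 104 bytes, so a 65000-byte ipi_len
leaves 64896 bytes of TSO payload for the hardware to segment.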
Index: sys/net/ifdi_if.m
===================================================================
--- sys/net/ifdi_if.m
+++ sys/net/ifdi_if.m
@@ -97,6 +97,11 @@
 	{
 	}
 
+	static void
+	null_vxlan_register_op(if_ctx_t _ctx __unused, uint16_t port __unused)
+	{
+	}
+
 	static int
 	null_q_setup(if_ctx_t _ctx __unused, uint32_t _qid __unused)
 	{
@@ -458,6 +463,16 @@
 	uint16_t _vtag;
 } DEFAULT null_vlan_register_op;
 
+METHOD void vxlan_register {
+	if_ctx_t _ctx;
+	uint16_t _port;
+} DEFAULT null_vxlan_register_op;
+
+METHOD void vxlan_unregister {
+	if_ctx_t _ctx;
+	uint16_t _port;
+} DEFAULT null_vxlan_register_op;
+
 METHOD int sysctl_int_delay {
 	if_ctx_t _sctx;
 	if_int_delay_info_t _iidi;
Index: sys/net/iflib.h
===================================================================
--- sys/net/iflib.h
+++ sys/net/iflib.h
@@ -131,7 +131,11 @@
 	uint8_t ipi_mflags;		/* packet mbuf flags */
 
 	uint32_t ipi_tcp_seq;		/* tcp seqno */
-	uint32_t __spare0__;
+
+	/* Tunneled packets offload handling */
+	uint16_t ipi_outer_etype;	/* outer ethertype */
+	uint8_t ipi_outer_ip_hlen;	/* outer IP header length */
+	uint8_t ipi_tun_hlen;		/* tunnel headers length */
 } *if_pkt_info_t;
 
 typedef struct if_irq {
@@ -410,6 +414,7 @@
  */
 enum iflib_restart_event {
 	IFLIB_RESTART_VLAN_CONFIG,
+	IFLIB_RESTART_VXLAN_CONFIG,
 };
 
 /*
Index: sys/net/iflib.c
===================================================================
--- sys/net/iflib.c
+++ sys/net/iflib.c
@@ -56,6 +56,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -71,6 +72,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -211,6 +213,8 @@
 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
 	eventhandler_tag ifc_vlan_attach_event;
 	eventhandler_tag ifc_vlan_detach_event;
+	eventhandler_tag ifc_vxlan_attach_event;
+	eventhandler_tag ifc_vxlan_detach_event;
 	struct ether_addr ifc_mac;
 };
@@ -708,7 +712,7 @@
 #endif
 static int iflib_register(if_ctx_t);
 static void iflib_deregister(if_ctx_t);
-static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
+static void iflib_unregister_event_handlers(if_ctx_t ctx);
 static uint16_t iflib_get_mbuf_size_for(unsigned int size);
 static void iflib_init_locked(if_ctx_t ctx);
 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
@@ -2464,6 +2468,7 @@
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
+	int vxlan_hwcsum_flags, vxlan_hwtso_flags;
 
 	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 
 	IFDI_INTR_DISABLE(ctx);
@@ -2476,6 +2481,12 @@
 	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
 	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
+	/* CSUM_ENCAP_VXLAN has to be set if any VXLAN offload is enabled */
+	vxlan_hwcsum_flags = scctx->isc_tx_csum_flags & (CSUM_INNER_IP6_UDP |
+	    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP |
+	    CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_ENCAP_VXLAN);
+	vxlan_hwtso_flags = scctx->isc_tx_csum_flags & (CSUM_INNER_IP6_TSO |
+	    CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN);
 	/* Set hardware offload abilities */
 	if_clearhwassist(ifp);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
@@ -2486,6 +2497,10 @@
 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO6)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
+	if (if_getcapenable(ifp) & IFCAP_VXLAN_HWCSUM)
+		if_sethwassistbits(ifp, vxlan_hwcsum_flags, 0);
+	if (if_getcapenable(ifp) & IFCAP_VXLAN_HWTSO)
+		if_sethwassistbits(ifp, vxlan_hwtso_flags, 0);
 
 	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
@@ -3157,13 +3172,178 @@
 	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz,
 	       pi->ipi_mflags, pi->ipi_vtag);
 	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
 	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
+	printf("pi outer_etype: %d outer_ip_hlen: %d tun_hlen: %d\n",
+	       pi->ipi_outer_etype, pi->ipi_outer_ip_hlen, pi->ipi_tun_hlen);
 }
 #endif
 
 #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
 #define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
+#define IS_TX_INNER_OFFLOAD4(pi) ((pi)->ipi_csum_flags & \
+    (CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO))
+#define IS_INNER_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_INNER_IP_TSO)
 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
 #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
+#define IS_TX_INNER_OFFLOAD6(pi) ((pi)->ipi_csum_flags & \
+    (CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO))
+#define IS_INNER_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_INNER_IP6_TSO)
+
+static int
+iflib_parse_inner_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
+{
+	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
+	struct ether_vlan_header *eh;
+	struct ip *outer_ip;
+	struct mbuf *m;
+	int ehdrlen;
+	size_t off;
+
+	if (pi->ipi_ipproto != IPPROTO_UDP)
+		return (ENXIO);
+
+	m = *mp;
+
+	/*
+	 * Save the outer frame info and reuse etype and ip_hlen for the
+	 * inner frame.
+	 */
+	pi->ipi_outer_etype = pi->ipi_etype;
+	pi->ipi_outer_ip_hlen = pi->ipi_ip_hlen;
+	pi->ipi_tun_hlen = sizeof(struct udphdr) + sizeof(struct vxlan_header);
+
+	/* size of the outer frame headers */
+	off = pi->ipi_ehdrlen + pi->ipi_ip_hlen + pi->ipi_tun_hlen;
+	outer_ip = (struct ip *)((caddr_t)m->m_data + pi->ipi_ehdrlen);
+
+	/* For VXLAN the first mbuf usually contains only the outer headers */
+	if (m->m_len == off) {
+		m = m->m_next;
+		off = 0;
+	}
+
+	if (__predict_false((size_t)m->m_len < off + sizeof(*eh)))
+		return (ENOMEM);
+
+	eh = (struct ether_vlan_header *)((caddr_t)m->m_data + off);
+	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
+		pi->ipi_etype = ntohs(eh->evl_proto);
+		ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
+	} else {
+		pi->ipi_etype = ntohs(eh->evl_encap_proto);
+		ehdrlen = ETHER_HDR_LEN;
+	}
+	pi->ipi_tun_hlen += ehdrlen;
+
+	switch (pi->ipi_etype) {
+#ifdef INET
+	case ETHERTYPE_IP:
+	{
+		struct ip *ip = NULL;
+		struct tcphdr *th = NULL;
+		int minhlen = off + ehdrlen + sizeof(*ip);
+
+		if (m->m_pkthdr.csum_flags & (CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO))
+			minhlen += sizeof(*th);
+		minhlen = min(m->m_pkthdr.len, minhlen);
+		if (__predict_false(m->m_len < minhlen)) {
+			txq->ift_pullups++;
+			if (__predict_false((m = m_pullup(m, minhlen)) == NULL))
+				return (ENOMEM);
+		}
+		ip = (struct ip *)(m->m_data + off + ehdrlen);
+		if (m->m_len >= off + ehdrlen + (ip->ip_hl << 2) + sizeof(*th))
+			th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
+
+		pi->ipi_ip_hlen = ip->ip_hl << 2;
+		pi->ipi_ipproto = ip->ip_p;
+
+		/* TCP checksum offload may require the TCP header length */
+		if (IS_TX_INNER_OFFLOAD4(pi)) {
+			if (__predict_false(ip->ip_p != IPPROTO_TCP))
+				return (ENXIO);
+
+			if (__predict_false(th == NULL)) {
+				txq->ift_pullups++;
+				if (__predict_false((m = m_pullup(m, off + ehdrlen +
+				    (ip->ip_hl << 2) + sizeof(*th))) == NULL))
+					return (ENOMEM);
+				ip = (struct ip *)(m->m_data + off + ehdrlen);
+				th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
+			}
+			pi->ipi_tcp_hflags = th->th_flags;
+			pi->ipi_tcp_hlen = th->th_off << 2;
+			pi->ipi_tcp_seq = th->th_seq;
+
+			if (IS_INNER_TSO4(pi)) {
+				/*
+				 * TSO always requires hardware checksum offload.
+				 */
+				pi->ipi_csum_flags |= (CSUM_INNER_IP_TCP | CSUM_INNER_IP);
+				th->th_sum = in_pseudo(ip->ip_src.s_addr,
+				    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
+				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
+				if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
+					ip->ip_sum = 0;
+					ip->ip_len = htons(pi->ipi_ip_hlen +
+					    pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
+				}
+			}
+		}
+		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) &&
+		    (pi->ipi_csum_flags & CSUM_INNER_IP)) {
+			ip->ip_sum = 0;
+			outer_ip->ip_sum = 0;
+		}
+		break;
+	}
+#endif
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+	{
+		struct ip6_hdr *ip6;
+		struct tcphdr *th;
+
+		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
+
+		if (__predict_false(m->m_len < off + ehdrlen +
+		    sizeof(struct ip6_hdr))) {
+			txq->ift_pullups++;
+			if (__predict_false((m = m_pullup(m, off + ehdrlen +
+			    sizeof(struct ip6_hdr))) == NULL))
+				return (ENOMEM);
+		}
+		ip6 = (struct ip6_hdr *)(m->m_data + off + ehdrlen);
+		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
+
+		/* XXX-BZ this will go badly in case of ext hdrs. */
+		pi->ipi_ipproto = ip6->ip6_nxt;
+		pi->ipi_flags |= IPI_TX_IPV6;
+
+		/* TCP checksum offload may require the TCP header length */
+		if (IS_TX_INNER_OFFLOAD6(pi)) {
+			if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
+				return (ENXIO);
+
+			if (__predict_false(m->m_len < off + ehdrlen +
+			    sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
+				txq->ift_pullups++;
+				if (__predict_false((m = m_pullup(m, off + ehdrlen +
+				    sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
+					return (ENOMEM);
+				ip6 = (struct ip6_hdr *)(m->m_data + off + ehdrlen);
+				th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
+			}
+			pi->ipi_tcp_hflags = th->th_flags;
+			pi->ipi_tcp_hlen = th->th_off << 2;
+			pi->ipi_tcp_seq = th->th_seq;
+
+			if (IS_INNER_TSO6(pi)) {
+				/*
+				 * TSO always requires hardware checksum offload.
+				 */
+				pi->ipi_csum_flags |= CSUM_INNER_IP6_TCP;
+				th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
+				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
+			}
+		}
+		break;
+	}
+#endif
+	default:
+		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
+		pi->ipi_ip_hlen = 0;
+		break;
+	}
+
+	*mp = m;
+
+	return (0);
+}
 
 static int
 iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
@@ -3210,9 +3390,11 @@
 		struct mbuf *n;
 		struct ip *ip = NULL;
 		struct tcphdr *th = NULL;
-		int minthlen;
+		int minthlen = pi->ipi_ehdrlen + sizeof(*ip);
 
-		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
+		if ((m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN) == 0)
+			minthlen += sizeof(*th);
+		minthlen = min(m->m_pkthdr.len, minthlen);
 		if (__predict_false(m->m_len < minthlen)) {
 			/*
 			 * if this code bloat is causing too much of a hit
@@ -3334,6 +3516,9 @@
 	}
 	*mp = m;
 
+	if ((m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN) != 0)
+		return (iflib_parse_inner_header(txq, pi, mp));
+
 	return (0);
 }
@@ -4273,7 +4458,8 @@
 #define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
     IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
-    IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_MEXTPG)
+    IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_MEXTPG | \
+    IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO)
 
 static int
 iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
@@ -4515,6 +4701,50 @@
 	CTX_UNLOCK(ctx);
 }
 
+static void
+iflib_vxlan_register(void *arg, if_t ifp, sa_family_t family, uint16_t port)
+{
+	if_ctx_t ctx = arg;
+
+	MPASS(family == AF_INET || family == AF_INET6);
+
+	if (iflib_in_detach(ctx))
+		return;
+
+	/* Check if the interface has VXLAN offloads enabled */
+	if (!(if_getcapenable(ctx->ifc_ifp) &
+	    (IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO)))
+		return;
+
+	CTX_LOCK(ctx);
+	/* Driver may need to stop traffic before enabling VXLAN offload */
+	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VXLAN_CONFIG))
+		iflib_stop(ctx);
+	IFDI_VXLAN_REGISTER(ctx, port);
+	/* Re-init to load the changes, if required */
+	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VXLAN_CONFIG))
+		iflib_init_locked(ctx);
+	CTX_UNLOCK(ctx);
+}
+
+static void
+iflib_vxlan_unregister(void *arg, if_t ifp, sa_family_t family, uint16_t port)
+{
+	if_ctx_t ctx = arg;
+
+	MPASS(family == AF_INET || family == AF_INET6);
+
+	CTX_LOCK(ctx);
+	/* Driver may need all encapsulated packets to be flushed */
+	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VXLAN_CONFIG))
+		iflib_stop(ctx);
+	IFDI_VXLAN_UNREGISTER(ctx, port);
+	/* Re-init to load the changes, if required */
+	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VXLAN_CONFIG))
+		iflib_init_locked(ctx);
+	CTX_UNLOCK(ctx);
+}
+
 static void
 iflib_led_func(void *arg, int onoff)
 {
@@ -5458,7 +5688,7 @@
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 
 	/* Unregister VLAN event handlers early */
-	iflib_unregister_vlan_handlers(ctx);
+	iflib_unregister_event_handlers(ctx);
 
 	if ((sctx->isc_flags & IFLIB_PSEUDO) &&
 	    (sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0) {
@@ -5518,8 +5748,8 @@
 	ctx->ifc_flags |= IFC_IN_DETACH;
 	STATE_UNLOCK(ctx);
 
-	/* Unregister VLAN handlers before calling iflib_stop() */
-	iflib_unregister_vlan_handlers(ctx);
+	/* Unregister VLAN and VXLAN handlers before calling iflib_stop() */
+	iflib_unregister_event_handlers(ctx);
 
 	iflib_netmap_detach(ifp);
 	ether_ifdetach(ifp);
@@ -5835,6 +6065,12 @@
 	ctx->ifc_vlan_detach_event =
 	    EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
 	        EVENTHANDLER_PRI_FIRST);
+	ctx->ifc_vxlan_attach_event =
+	    EVENTHANDLER_REGISTER(vxlan_start, iflib_vxlan_register, ctx,
+	        EVENTHANDLER_PRI_FIRST);
+	ctx->ifc_vxlan_detach_event =
+	    EVENTHANDLER_REGISTER(vxlan_stop, iflib_vxlan_unregister, ctx,
+	        EVENTHANDLER_PRI_FIRST);
 
 	if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) {
 		ctx->ifc_mediap = &ctx->ifc_media;
@@ -5845,7 +6081,7 @@
 }
 
 static void
-iflib_unregister_vlan_handlers(if_ctx_t ctx)
+iflib_unregister_event_handlers(if_ctx_t ctx)
 {
 	/* Unregister VLAN events */
 	if (ctx->ifc_vlan_attach_event != NULL) {
@@ -5857,6 +6093,15 @@
 		ctx->ifc_vlan_detach_event = NULL;
 	}
 
+	/* Unregister VXLAN events */
+	if (ctx->ifc_vxlan_attach_event != NULL) {
+		EVENTHANDLER_DEREGISTER(vxlan_start, ctx->ifc_vxlan_attach_event);
+		ctx->ifc_vxlan_attach_event = NULL;
+	}
+	if (ctx->ifc_vxlan_detach_event != NULL) {
+		EVENTHANDLER_DEREGISTER(vxlan_stop, ctx->ifc_vxlan_detach_event);
+		ctx->ifc_vxlan_detach_event = NULL;
+	}
 }
 
 static void
@@ -5867,8 +6112,8 @@
 	/* Remove all media */
 	ifmedia_removeall(&ctx->ifc_media);
 
-	/* Ensure that VLAN event handlers are unregistered */
-	iflib_unregister_vlan_handlers(ctx);
+	/* Ensure that VLAN and VXLAN event handlers are unregistered */
+	iflib_unregister_event_handlers(ctx);
 
 	/* Release kobject reference */
 	kobj_delete((kobj_t) ctx, NULL);
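Note (illustration only, not part of the patch): the subtle part of
iflib_parse_inner_header() is the offset bookkeeping. 'off' initially covers
the whole outer stack (outer Ethernet + outer IP + UDP + VXLAN), is reset to 0
when those headers sit alone in the leading mbuf, and the inner Ethernet
header length is folded into ipi_tun_hlen only after the inner frame has been
located. A runnable sketch of that arithmetic, assuming an untagged
IPv4/UDP/VXLAN outer stack:

/* inner_off.c -- offset bookkeeping mirrored from iflib_parse_inner_header(). */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t ipi_ehdrlen = 14;	/* outer Ethernet */
	uint32_t ipi_ip_hlen = 20;	/* outer IPv4, becomes ipi_outer_ip_hlen */
	uint32_t udp_vxlan = 8 + 8;	/* struct udphdr + struct vxlan_header */
	uint32_t inner_eh = 14;		/* inner Ethernet, untagged */

	uint32_t ipi_tun_hlen = udp_vxlan;
	uint32_t off = ipi_ehdrlen + ipi_ip_hlen + ipi_tun_hlen;

	/* The inner Ethernet header starts at 'off'; parsing it folds its
	 * length into ipi_tun_hlen, so the inner IP header starts at: */
	uint32_t inner_ip_start = off + inner_eh;
	ipi_tun_hlen += inner_eh;

	printf("off=%u inner IP at %u, ipi_tun_hlen=%u (NATLEN field: %u words)\n",
	    off, inner_ip_start, ipi_tun_hlen, ipi_tun_hlen >> 1);
	return (0);
}

The final ipi_tun_hlen (30 bytes here) is what ixl_ctxd_setup() later writes
into the descriptor's NATLEN field in 2-byte words, alongside
ipi_outer_ip_hlen in 4-byte words.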