diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c
index 505a62b01b4e..41eaa6a56086 100644
--- a/sys/dev/virtio/network/if_vtnet.c
+++ b/sys/dev/virtio/network/if_vtnet.c
@@ -1,4460 +1,4456 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011, Bryan Venteicher
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Driver for VirtIO network devices. */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/msan.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/random.h>
#include <sys/sglist.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/taskqueue.h>
#include <sys/smp.h>
#include <machine/smp.h>

#include <vm/uma.h>

#include <net/debugnet.h>
#include <net/ethernet.h>
#include <net/pfil.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_media.h>
#include <net/if_vlan_var.h>

#include <net/bpf.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>
#include <dev/virtio/network/virtio_net.h>
#include <dev/virtio/network/if_vtnetvar.h>
#include "virtio_if.h"

#include "opt_inet.h"
#include "opt_inet6.h"

#if defined(INET) || defined(INET6)
#include <machine/in_cksum.h>
#endif

static int vtnet_modevent(module_t, int, void *);

static int vtnet_probe(device_t);
static int vtnet_attach(device_t);
static int vtnet_detach(device_t);
static int vtnet_suspend(device_t);
static int vtnet_resume(device_t);
static int vtnet_shutdown(device_t);
static int vtnet_attach_completed(device_t);
static int vtnet_config_change(device_t);

static int vtnet_negotiate_features(struct vtnet_softc *);
static int vtnet_setup_features(struct vtnet_softc *);
static int vtnet_init_rxq(struct vtnet_softc *, int);
static int vtnet_init_txq(struct vtnet_softc *, int);
static int vtnet_alloc_rxtx_queues(struct vtnet_softc *);
static void vtnet_free_rxtx_queues(struct vtnet_softc *);
static int vtnet_alloc_rx_filters(struct vtnet_softc *);
static void vtnet_free_rx_filters(struct vtnet_softc *);
static int vtnet_alloc_virtqueues(struct vtnet_softc *);
static int vtnet_alloc_interface(struct vtnet_softc *);
static int vtnet_setup_interface(struct vtnet_softc *);
static int vtnet_ioctl_mtu(struct vtnet_softc *, u_int);
static int vtnet_ioctl_ifflags(struct vtnet_softc *);
static int vtnet_ioctl_multi(struct vtnet_softc *);
static int vtnet_ioctl_ifcap(struct vtnet_softc *, struct ifreq *);
static int vtnet_ioctl(if_t, u_long, caddr_t);
static uint64_t vtnet_get_counter(if_t, ift_counter);

static int vtnet_rxq_populate(struct vtnet_rxq *);
static void
vtnet_rxq_free_mbufs(struct vtnet_rxq *);
static struct mbuf *
    vtnet_rx_alloc_buf(struct vtnet_softc *, int, struct mbuf **);
static int vtnet_rxq_replace_lro_nomrg_buf(struct vtnet_rxq *,
    struct mbuf *, int);
static int vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int);
static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *);
static int vtnet_rxq_new_buf(struct vtnet_rxq *);
static int vtnet_rxq_csum_needs_csum(struct vtnet_rxq *, struct mbuf *,
    uint16_t, int, struct virtio_net_hdr *);
static int vtnet_rxq_csum_data_valid(struct vtnet_rxq *, struct mbuf *,
    uint16_t, int, struct virtio_net_hdr *);
static int vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *,
    struct virtio_net_hdr *);
static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int);
static void vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *);
static int vtnet_rxq_merged_eof(struct vtnet_rxq *, struct mbuf *, int);
static void vtnet_rxq_input(struct vtnet_rxq *, struct mbuf *,
    struct virtio_net_hdr *);
static int vtnet_rxq_eof(struct vtnet_rxq *);
static void vtnet_rx_vq_process(struct vtnet_rxq *rxq, int tries);
static void vtnet_rx_vq_intr(void *);
static void vtnet_rxq_tq_intr(void *, int);

static int vtnet_txq_intr_threshold(struct vtnet_txq *);
static int vtnet_txq_below_threshold(struct vtnet_txq *);
static int vtnet_txq_notify(struct vtnet_txq *);
static void vtnet_txq_free_mbufs(struct vtnet_txq *);
static int vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *, int *,
    int *, int *);
static int vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int,
    int, struct virtio_net_hdr *);
static struct mbuf *
    vtnet_txq_offload(struct vtnet_txq *, struct mbuf *,
    struct virtio_net_hdr *);
static int vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **,
    struct vtnet_tx_header *);
static int vtnet_txq_encap(struct vtnet_txq *, struct mbuf **, int);
#ifdef VTNET_LEGACY_TX
static void vtnet_start_locked(struct vtnet_txq *, if_t);
static void vtnet_start(if_t);
#else
static int vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *);
static int vtnet_txq_mq_start(if_t, struct mbuf *);
static void vtnet_txq_tq_deferred(void *, int);
#endif
static void vtnet_txq_start(struct vtnet_txq *);
static void vtnet_txq_tq_intr(void *, int);
static int vtnet_txq_eof(struct vtnet_txq *);
static void vtnet_tx_vq_intr(void *);
static void vtnet_tx_start_all(struct vtnet_softc *);

#ifndef VTNET_LEGACY_TX
static void vtnet_qflush(if_t);
#endif

static int vtnet_watchdog(struct vtnet_txq *);
static void vtnet_accum_stats(struct vtnet_softc *,
    struct vtnet_rxq_stats *, struct vtnet_txq_stats *);
static void vtnet_tick(void *);

static void vtnet_start_taskqueues(struct vtnet_softc *);
static void vtnet_free_taskqueues(struct vtnet_softc *);
static void vtnet_drain_taskqueues(struct vtnet_softc *);

static void vtnet_drain_rxtx_queues(struct vtnet_softc *);
static void vtnet_stop_rendezvous(struct vtnet_softc *);
static void vtnet_stop(struct vtnet_softc *);
static int vtnet_virtio_reinit(struct vtnet_softc *);
static void vtnet_init_rx_filters(struct vtnet_softc *);
static int vtnet_init_rx_queues(struct vtnet_softc *);
static int vtnet_init_tx_queues(struct vtnet_softc *);
static int vtnet_init_rxtx_queues(struct vtnet_softc *);
static void vtnet_set_active_vq_pairs(struct vtnet_softc *);
static void vtnet_update_rx_offloads(struct vtnet_softc *);
static int vtnet_reinit(struct vtnet_softc *);
static void vtnet_init_locked(struct vtnet_softc *, int);
static void vtnet_init(void *);

static void
vtnet_free_ctrl_vq(struct vtnet_softc *);
static void vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *,
    struct sglist *, int, int);
static int vtnet_ctrl_mac_cmd(struct vtnet_softc *, uint8_t *);
static int vtnet_ctrl_guest_offloads(struct vtnet_softc *, uint64_t);
static int vtnet_ctrl_mq_cmd(struct vtnet_softc *, uint16_t);
static int vtnet_ctrl_rx_cmd(struct vtnet_softc *, uint8_t, bool);
static int vtnet_set_promisc(struct vtnet_softc *, bool);
static int vtnet_set_allmulti(struct vtnet_softc *, bool);
static void vtnet_rx_filter(struct vtnet_softc *);
static void vtnet_rx_filter_mac(struct vtnet_softc *);
static int vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t);
static void vtnet_rx_filter_vlan(struct vtnet_softc *);
static void vtnet_update_vlan_filter(struct vtnet_softc *, int, uint16_t);
static void vtnet_register_vlan(void *, if_t, uint16_t);
static void vtnet_unregister_vlan(void *, if_t, uint16_t);

static void vtnet_update_speed_duplex(struct vtnet_softc *);
static int vtnet_is_link_up(struct vtnet_softc *);
static void vtnet_update_link_status(struct vtnet_softc *);
static int vtnet_ifmedia_upd(if_t);
static void vtnet_ifmedia_sts(if_t, struct ifmediareq *);
static void vtnet_get_macaddr(struct vtnet_softc *);
static void vtnet_set_macaddr(struct vtnet_softc *);
static void vtnet_attached_set_macaddr(struct vtnet_softc *);
static void vtnet_vlan_tag_remove(struct mbuf *);
static void vtnet_set_rx_process_limit(struct vtnet_softc *);

static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *,
    struct sysctl_oid_list *, struct vtnet_rxq *);
static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *,
    struct sysctl_oid_list *, struct vtnet_txq *);
static void vtnet_setup_queue_sysctl(struct vtnet_softc *);
static void vtnet_load_tunables(struct vtnet_softc *);
static void vtnet_setup_sysctl(struct vtnet_softc *);

static int vtnet_rxq_enable_intr(struct vtnet_rxq *);
static void vtnet_rxq_disable_intr(struct vtnet_rxq *);
static int vtnet_txq_enable_intr(struct vtnet_txq *);
static void vtnet_txq_disable_intr(struct vtnet_txq *);
static void vtnet_enable_rx_interrupts(struct vtnet_softc *);
static void vtnet_enable_tx_interrupts(struct vtnet_softc *);
static void vtnet_enable_interrupts(struct vtnet_softc *);
static void vtnet_disable_rx_interrupts(struct vtnet_softc *);
static void vtnet_disable_tx_interrupts(struct vtnet_softc *);
static void vtnet_disable_interrupts(struct vtnet_softc *);

static int vtnet_tunable_int(struct vtnet_softc *, const char *, int);

DEBUGNET_DEFINE(vtnet);

#define vtnet_htog16(_sc, _val)  virtio_htog16(vtnet_modern(_sc), _val)
#define vtnet_htog32(_sc, _val)  virtio_htog32(vtnet_modern(_sc), _val)
#define vtnet_htog64(_sc, _val)  virtio_htog64(vtnet_modern(_sc), _val)
#define vtnet_gtoh16(_sc, _val)  virtio_gtoh16(vtnet_modern(_sc), _val)
#define vtnet_gtoh32(_sc, _val)  virtio_gtoh32(vtnet_modern(_sc), _val)
#define vtnet_gtoh64(_sc, _val)  virtio_gtoh64(vtnet_modern(_sc), _val)

/* Tunables.
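 * All of these are read-only tunables (CTLFLAG_RDTUN) evaluated during
 * attach. Most can also be overridden per device through
 * vtnet_tunable_int(), e.g. in loader.conf:
 *
 *   hw.vtnet.0.mq_max_pairs=4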
 */
static SYSCTL_NODE(_hw, OID_AUTO, vtnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VirtIO Net driver parameters");

static int vtnet_csum_disable = 0;
SYSCTL_INT(_hw_vtnet, OID_AUTO, csum_disable, CTLFLAG_RDTUN,
    &vtnet_csum_disable, 0, "Disables receive and send checksum offload");

static int vtnet_fixup_needs_csum = 0;
SYSCTL_INT(_hw_vtnet, OID_AUTO, fixup_needs_csum, CTLFLAG_RDTUN,
    &vtnet_fixup_needs_csum, 0,
    "Calculate valid checksum for NEEDS_CSUM packets");

static int vtnet_tso_disable = 0;
SYSCTL_INT(_hw_vtnet, OID_AUTO, tso_disable, CTLFLAG_RDTUN,
    &vtnet_tso_disable, 0, "Disables TSO");

static int vtnet_lro_disable = 0;
SYSCTL_INT(_hw_vtnet, OID_AUTO, lro_disable, CTLFLAG_RDTUN,
    &vtnet_lro_disable, 0, "Disables hardware LRO");

static int vtnet_mq_disable = 0;
SYSCTL_INT(_hw_vtnet, OID_AUTO, mq_disable, CTLFLAG_RDTUN,
    &vtnet_mq_disable, 0, "Disables multiqueue support");

static int vtnet_mq_max_pairs = VTNET_MAX_QUEUE_PAIRS;
SYSCTL_INT(_hw_vtnet, OID_AUTO, mq_max_pairs, CTLFLAG_RDTUN,
    &vtnet_mq_max_pairs, 0, "Maximum number of multiqueue pairs");

static int vtnet_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_vtnet, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &vtnet_tso_maxlen, 0, "TSO burst limit");

static int vtnet_rx_process_limit = 1024;
SYSCTL_INT(_hw_vtnet, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN,
    &vtnet_rx_process_limit, 0,
    "Number of RX segments processed in one pass");

static int vtnet_lro_entry_count = 128;
SYSCTL_INT(_hw_vtnet, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &vtnet_lro_entry_count, 0, "Software LRO entry count");

/* Enable sorted LRO, and the depth of the mbuf queue. */
static int vtnet_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_vtnet, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &vtnet_lro_mbufq_depth, 0, "Depth of software LRO mbuf queue");

static uma_zone_t vtnet_tx_header_zone;

static struct virtio_feature_desc vtnet_feature_desc[] = {
    { VIRTIO_NET_F_CSUM,                 "TxChecksum"      },
    { VIRTIO_NET_F_GUEST_CSUM,           "RxChecksum"      },
    { VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,  "CtrlRxOffloads"  },
    { VIRTIO_NET_F_MAC,                  "MAC"             },
    { VIRTIO_NET_F_GSO,                  "TxGSO"           },
    { VIRTIO_NET_F_GUEST_TSO4,           "RxLROv4"         },
    { VIRTIO_NET_F_GUEST_TSO6,           "RxLROv6"         },
    { VIRTIO_NET_F_GUEST_ECN,            "RxLROECN"        },
    { VIRTIO_NET_F_GUEST_UFO,            "RxUFO"           },
    { VIRTIO_NET_F_HOST_TSO4,            "TxTSOv4"         },
    { VIRTIO_NET_F_HOST_TSO6,            "TxTSOv6"         },
    { VIRTIO_NET_F_HOST_ECN,             "TxTSOECN"        },
    { VIRTIO_NET_F_HOST_UFO,             "TxUFO"           },
    { VIRTIO_NET_F_MRG_RXBUF,            "MrgRxBuf"        },
    { VIRTIO_NET_F_STATUS,               "Status"          },
    { VIRTIO_NET_F_CTRL_VQ,              "CtrlVq"          },
    { VIRTIO_NET_F_CTRL_RX,              "CtrlRxMode"      },
    { VIRTIO_NET_F_CTRL_VLAN,            "CtrlVLANFilter"  },
    { VIRTIO_NET_F_CTRL_RX_EXTRA,        "CtrlRxModeExtra" },
    { VIRTIO_NET_F_GUEST_ANNOUNCE,       "GuestAnnounce"   },
    { VIRTIO_NET_F_MQ,                   "Multiqueue"      },
    { VIRTIO_NET_F_CTRL_MAC_ADDR,        "CtrlMacAddr"     },
    { VIRTIO_NET_F_SPEED_DUPLEX,         "SpeedDuplex"     },

    { 0, NULL }
};

static device_method_t vtnet_methods[] = {
    /* Device methods. */
    DEVMETHOD(device_probe,     vtnet_probe),
    DEVMETHOD(device_attach,    vtnet_attach),
    DEVMETHOD(device_detach,    vtnet_detach),
    DEVMETHOD(device_suspend,   vtnet_suspend),
    DEVMETHOD(device_resume,    vtnet_resume),
    DEVMETHOD(device_shutdown,  vtnet_shutdown),

    /* VirtIO methods.
*/ DEVMETHOD(virtio_attach_completed, vtnet_attach_completed), DEVMETHOD(virtio_config_change, vtnet_config_change), DEVMETHOD_END }; #ifdef DEV_NETMAP #include #endif static driver_t vtnet_driver = { .name = "vtnet", .methods = vtnet_methods, .size = sizeof(struct vtnet_softc) }; VIRTIO_DRIVER_MODULE(vtnet, vtnet_driver, vtnet_modevent, NULL); MODULE_VERSION(vtnet, 1); MODULE_DEPEND(vtnet, virtio, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(vtnet, netmap, 1, 1, 1); #endif VIRTIO_SIMPLE_PNPINFO(vtnet, VIRTIO_ID_NETWORK, "VirtIO Networking Adapter"); static int vtnet_modevent(module_t mod __unused, int type, void *unused __unused) { int error = 0; static int loaded = 0; switch (type) { case MOD_LOAD: if (loaded++ == 0) { vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr", sizeof(struct vtnet_tx_header), NULL, NULL, NULL, NULL, 0, 0); #ifdef DEBUGNET /* * We need to allocate from this zone in the transmit path, so ensure * that we have at least one item per header available. * XXX add a separate zone like we do for mbufs? otherwise we may alloc * buckets */ uma_zone_reserve(vtnet_tx_header_zone, DEBUGNET_MAX_IN_FLIGHT * 2); uma_prealloc(vtnet_tx_header_zone, DEBUGNET_MAX_IN_FLIGHT * 2); #endif } break; case MOD_QUIESCE: if (uma_zone_get_cur(vtnet_tx_header_zone) > 0) error = EBUSY; break; case MOD_UNLOAD: if (--loaded == 0) { uma_zdestroy(vtnet_tx_header_zone); vtnet_tx_header_zone = NULL; } break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static int vtnet_probe(device_t dev) { return (VIRTIO_SIMPLE_PROBE(dev, vtnet)); } static int vtnet_attach(device_t dev) { struct vtnet_softc *sc; int error; sc = device_get_softc(dev); sc->vtnet_dev = dev; virtio_set_feature_desc(dev, vtnet_feature_desc); VTNET_CORE_LOCK_INIT(sc); callout_init_mtx(&sc->vtnet_tick_ch, VTNET_CORE_MTX(sc), 0); vtnet_load_tunables(sc); error = vtnet_alloc_interface(sc); if (error) { device_printf(dev, "cannot allocate interface\n"); goto fail; } vtnet_setup_sysctl(sc); error = vtnet_setup_features(sc); if (error) { device_printf(dev, "cannot setup features\n"); goto fail; } error = vtnet_alloc_rx_filters(sc); if (error) { device_printf(dev, "cannot allocate Rx filters\n"); goto fail; } error = vtnet_alloc_rxtx_queues(sc); if (error) { device_printf(dev, "cannot allocate queues\n"); goto fail; } error = vtnet_alloc_virtqueues(sc); if (error) { device_printf(dev, "cannot allocate virtqueues\n"); goto fail; } error = vtnet_setup_interface(sc); if (error) { device_printf(dev, "cannot setup interface\n"); goto fail; } error = virtio_setup_intr(dev, INTR_TYPE_NET); if (error) { device_printf(dev, "cannot setup interrupts\n"); ether_ifdetach(sc->vtnet_ifp); goto fail; } #ifdef DEV_NETMAP vtnet_netmap_attach(sc); #endif vtnet_start_taskqueues(sc); fail: if (error) vtnet_detach(dev); return (error); } static int vtnet_detach(device_t dev) { struct vtnet_softc *sc; if_t ifp; sc = device_get_softc(dev); ifp = sc->vtnet_ifp; if (device_is_attached(dev)) { VTNET_CORE_LOCK(sc); vtnet_stop(sc); VTNET_CORE_UNLOCK(sc); callout_drain(&sc->vtnet_tick_ch); vtnet_drain_taskqueues(sc); ether_ifdetach(ifp); } #ifdef DEV_NETMAP netmap_detach(ifp); #endif if (sc->vtnet_pfil != NULL) { pfil_head_unregister(sc->vtnet_pfil); sc->vtnet_pfil = NULL; } vtnet_free_taskqueues(sc); if (sc->vtnet_vlan_attach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach); sc->vtnet_vlan_attach = NULL; } if (sc->vtnet_vlan_detach != NULL) { EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vtnet_vlan_detach); 
sc->vtnet_vlan_detach = NULL; } ifmedia_removeall(&sc->vtnet_media); if (ifp != NULL) { if_free(ifp); sc->vtnet_ifp = NULL; } vtnet_free_rxtx_queues(sc); vtnet_free_rx_filters(sc); if (sc->vtnet_ctrl_vq != NULL) vtnet_free_ctrl_vq(sc); VTNET_CORE_LOCK_DESTROY(sc); return (0); } static int vtnet_suspend(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_stop(sc); sc->vtnet_flags |= VTNET_FLAG_SUSPENDED; VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_resume(device_t dev) { struct vtnet_softc *sc; if_t ifp; sc = device_get_softc(dev); ifp = sc->vtnet_ifp; VTNET_CORE_LOCK(sc); if (if_getflags(ifp) & IFF_UP) vtnet_init_locked(sc, 0); sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED; VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_shutdown(device_t dev) { /* * Suspend already does all of what we need to * do here; we just never expect to be resumed. */ return (vtnet_suspend(dev)); } static int vtnet_attach_completed(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_attached_set_macaddr(sc); VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_config_change(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_update_link_status(sc); if (sc->vtnet_link_active != 0) vtnet_tx_start_all(sc); VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_negotiate_features(struct vtnet_softc *sc) { device_t dev; uint64_t features, negotiated_features; int no_csum; dev = sc->vtnet_dev; features = virtio_bus_is_modern(dev) ? VTNET_MODERN_FEATURES : VTNET_LEGACY_FEATURES; /* * TSO and LRO are only available when their corresponding checksum * offload feature is also negotiated. */ no_csum = vtnet_tunable_int(sc, "csum_disable", vtnet_csum_disable); if (no_csum) features &= ~(VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM); if (no_csum || vtnet_tunable_int(sc, "tso_disable", vtnet_tso_disable)) features &= ~VTNET_TSO_FEATURES; if (no_csum || vtnet_tunable_int(sc, "lro_disable", vtnet_lro_disable)) features &= ~VTNET_LRO_FEATURES; #ifndef VTNET_LEGACY_TX if (vtnet_tunable_int(sc, "mq_disable", vtnet_mq_disable)) features &= ~VIRTIO_NET_F_MQ; #else features &= ~VIRTIO_NET_F_MQ; #endif negotiated_features = virtio_negotiate_features(dev, features); if (virtio_with_feature(dev, VIRTIO_NET_F_MTU)) { uint16_t mtu; mtu = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, mtu)); if (mtu < VTNET_MIN_MTU /* || mtu > VTNET_MAX_MTU */) { device_printf(dev, "Invalid MTU value: %d. " "MTU feature disabled.\n", mtu); features &= ~VIRTIO_NET_F_MTU; negotiated_features = virtio_negotiate_features(dev, features); } } if (virtio_with_feature(dev, VIRTIO_NET_F_MQ)) { uint16_t npairs; npairs = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, max_virtqueue_pairs)); if (npairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || npairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { device_printf(dev, "Invalid max_virtqueue_pairs value: " "%d. Multiqueue feature disabled.\n", npairs); features &= ~VIRTIO_NET_F_MQ; negotiated_features = virtio_negotiate_features(dev, features); } } if (virtio_with_feature(dev, VTNET_LRO_FEATURES) && virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0) { /* * LRO without mergeable buffers requires special care. This * is not ideal because every receive buffer must be large * enough to hold the maximum TCP packet, the Ethernet header, * and the header. This requires up to 34 descriptors with * MCLBYTES clusters. 
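 * (Roughly: a 65535-byte IP datagram plus the Ethernet header spans 33
 * MCLBYTES (2KB) clusters, and one more descriptor carries the header,
 * hence 34.)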
If we do not have indirect descriptors, * LRO is disabled since the virtqueue will not contain very * many receive buffers. */ if (!virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) { device_printf(dev, "Host LRO disabled since both mergeable buffers " "and indirect descriptors were not negotiated\n"); features &= ~VTNET_LRO_FEATURES; negotiated_features = virtio_negotiate_features(dev, features); } else sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG; } sc->vtnet_features = negotiated_features; sc->vtnet_negotiated_features = negotiated_features; return (virtio_finalize_features(dev)); } static int vtnet_setup_features(struct vtnet_softc *sc) { device_t dev; int error; dev = sc->vtnet_dev; error = vtnet_negotiate_features(sc); if (error) return (error); if (virtio_with_feature(dev, VIRTIO_F_VERSION_1)) sc->vtnet_flags |= VTNET_FLAG_MODERN; if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) sc->vtnet_flags |= VTNET_FLAG_INDIRECT; if (virtio_with_feature(dev, VIRTIO_RING_F_EVENT_IDX)) sc->vtnet_flags |= VTNET_FLAG_EVENT_IDX; if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) { /* This feature should always be negotiated. */ sc->vtnet_flags |= VTNET_FLAG_MAC; } if (virtio_with_feature(dev, VIRTIO_NET_F_MTU)) { sc->vtnet_max_mtu = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, mtu)); } else sc->vtnet_max_mtu = VTNET_MAX_MTU; if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) { sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS; sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); } else if (vtnet_modern(sc)) { /* This is identical to the mergeable header. */ sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_v1); } else sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr); if (vtnet_modern(sc) || sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) sc->vtnet_rx_nsegs = VTNET_RX_SEGS_HDR_INLINE; else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) sc->vtnet_rx_nsegs = VTNET_RX_SEGS_LRO_NOMRG; else sc->vtnet_rx_nsegs = VTNET_RX_SEGS_HDR_SEPARATE; /* * Favor "hardware" LRO if negotiated, but support software LRO as * a fallback; there is usually little benefit (or worse) with both. */ if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) == 0 && virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6) == 0) sc->vtnet_flags |= VTNET_FLAG_SW_LRO; if (virtio_with_feature(dev, VIRTIO_NET_F_GSO) || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4) || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) sc->vtnet_tx_nsegs = VTNET_TX_SEGS_MAX; else sc->vtnet_tx_nsegs = VTNET_TX_SEGS_MIN; sc->vtnet_req_vq_pairs = 1; sc->vtnet_max_vq_pairs = 1; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) { sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX)) sc->vtnet_flags |= VTNET_FLAG_CTRL_RX; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN)) sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_MAC_ADDR)) sc->vtnet_flags |= VTNET_FLAG_CTRL_MAC; if (virtio_with_feature(dev, VIRTIO_NET_F_MQ)) { sc->vtnet_max_vq_pairs = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, max_virtqueue_pairs)); } } if (sc->vtnet_max_vq_pairs > 1) { int req; /* * Limit the maximum number of requested queue pairs to the * number of CPUs and the configured maximum. 
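 * A negative mq_max_pairs tunable selects a single pair and zero
 * selects one pair per CPU; the result is clamped to both the device
 * maximum and mp_ncpus.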
*/ req = vtnet_tunable_int(sc, "mq_max_pairs", vtnet_mq_max_pairs); if (req < 0) req = 1; if (req == 0) req = mp_ncpus; if (req > sc->vtnet_max_vq_pairs) req = sc->vtnet_max_vq_pairs; if (req > mp_ncpus) req = mp_ncpus; if (req > 1) { sc->vtnet_req_vq_pairs = req; sc->vtnet_flags |= VTNET_FLAG_MQ; } } return (0); } static int vtnet_init_rxq(struct vtnet_softc *sc, int id) { struct vtnet_rxq *rxq; rxq = &sc->vtnet_rxqs[id]; snprintf(rxq->vtnrx_name, sizeof(rxq->vtnrx_name), "%s-rx%d", device_get_nameunit(sc->vtnet_dev), id); mtx_init(&rxq->vtnrx_mtx, rxq->vtnrx_name, NULL, MTX_DEF); rxq->vtnrx_sc = sc; rxq->vtnrx_id = id; rxq->vtnrx_sg = sglist_alloc(sc->vtnet_rx_nsegs, M_NOWAIT); if (rxq->vtnrx_sg == NULL) return (ENOMEM); #if defined(INET) || defined(INET6) if (vtnet_software_lro(sc)) { if (tcp_lro_init_args(&rxq->vtnrx_lro, sc->vtnet_ifp, sc->vtnet_lro_entry_count, sc->vtnet_lro_mbufq_depth) != 0) return (ENOMEM); } #endif NET_TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq); rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT, taskqueue_thread_enqueue, &rxq->vtnrx_tq); return (rxq->vtnrx_tq == NULL ? ENOMEM : 0); } static int vtnet_init_txq(struct vtnet_softc *sc, int id) { struct vtnet_txq *txq; txq = &sc->vtnet_txqs[id]; snprintf(txq->vtntx_name, sizeof(txq->vtntx_name), "%s-tx%d", device_get_nameunit(sc->vtnet_dev), id); mtx_init(&txq->vtntx_mtx, txq->vtntx_name, NULL, MTX_DEF); txq->vtntx_sc = sc; txq->vtntx_id = id; txq->vtntx_sg = sglist_alloc(sc->vtnet_tx_nsegs, M_NOWAIT); if (txq->vtntx_sg == NULL) return (ENOMEM); #ifndef VTNET_LEGACY_TX txq->vtntx_br = buf_ring_alloc(VTNET_DEFAULT_BUFRING_SIZE, M_DEVBUF, M_NOWAIT, &txq->vtntx_mtx); if (txq->vtntx_br == NULL) return (ENOMEM); TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq); #endif TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq); txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT, taskqueue_thread_enqueue, &txq->vtntx_tq); if (txq->vtntx_tq == NULL) return (ENOMEM); return (0); } static int vtnet_alloc_rxtx_queues(struct vtnet_softc *sc) { int i, npairs, error; npairs = sc->vtnet_max_vq_pairs; sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF, M_NOWAIT | M_ZERO); sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL) return (ENOMEM); for (i = 0; i < npairs; i++) { error = vtnet_init_rxq(sc, i); if (error) return (error); error = vtnet_init_txq(sc, i); if (error) return (error); } vtnet_set_rx_process_limit(sc); vtnet_setup_queue_sysctl(sc); return (0); } static void vtnet_destroy_rxq(struct vtnet_rxq *rxq) { rxq->vtnrx_sc = NULL; rxq->vtnrx_id = -1; #if defined(INET) || defined(INET6) tcp_lro_free(&rxq->vtnrx_lro); #endif if (rxq->vtnrx_sg != NULL) { sglist_free(rxq->vtnrx_sg); rxq->vtnrx_sg = NULL; } if (mtx_initialized(&rxq->vtnrx_mtx) != 0) mtx_destroy(&rxq->vtnrx_mtx); } static void vtnet_destroy_txq(struct vtnet_txq *txq) { txq->vtntx_sc = NULL; txq->vtntx_id = -1; if (txq->vtntx_sg != NULL) { sglist_free(txq->vtntx_sg); txq->vtntx_sg = NULL; } #ifndef VTNET_LEGACY_TX if (txq->vtntx_br != NULL) { buf_ring_free(txq->vtntx_br, M_DEVBUF); txq->vtntx_br = NULL; } #endif if (mtx_initialized(&txq->vtntx_mtx) != 0) mtx_destroy(&txq->vtntx_mtx); } static void vtnet_free_rxtx_queues(struct vtnet_softc *sc) { int i; if (sc->vtnet_rxqs != NULL) { for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_destroy_rxq(&sc->vtnet_rxqs[i]); free(sc->vtnet_rxqs, M_DEVBUF); sc->vtnet_rxqs = 
NULL; } if (sc->vtnet_txqs != NULL) { for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_destroy_txq(&sc->vtnet_txqs[i]); free(sc->vtnet_txqs, M_DEVBUF); sc->vtnet_txqs = NULL; } } static int vtnet_alloc_rx_filters(struct vtnet_softc *sc) { if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { sc->vtnet_mac_filter = malloc(sizeof(struct vtnet_mac_filter), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_mac_filter == NULL) return (ENOMEM); } if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { sc->vtnet_vlan_filter = malloc(sizeof(uint32_t) * VTNET_VLAN_FILTER_NWORDS, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_vlan_filter == NULL) return (ENOMEM); } return (0); } static void vtnet_free_rx_filters(struct vtnet_softc *sc) { if (sc->vtnet_mac_filter != NULL) { free(sc->vtnet_mac_filter, M_DEVBUF); sc->vtnet_mac_filter = NULL; } if (sc->vtnet_vlan_filter != NULL) { free(sc->vtnet_vlan_filter, M_DEVBUF); sc->vtnet_vlan_filter = NULL; } } static int vtnet_alloc_virtqueues(struct vtnet_softc *sc) { device_t dev; struct vq_alloc_info *info; struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i, idx, flags, nvqs, error; dev = sc->vtnet_dev; flags = 0; nvqs = sc->vtnet_max_vq_pairs * 2; if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) nvqs++; info = malloc(sizeof(struct vq_alloc_info) * nvqs, M_TEMP, M_NOWAIT); if (info == NULL) return (ENOMEM); for (i = 0, idx = 0; i < sc->vtnet_req_vq_pairs; i++, idx += 2) { rxq = &sc->vtnet_rxqs[i]; VQ_ALLOC_INFO_INIT(&info[idx], sc->vtnet_rx_nsegs, vtnet_rx_vq_intr, rxq, &rxq->vtnrx_vq, "%s-rx%d", device_get_nameunit(dev), rxq->vtnrx_id); txq = &sc->vtnet_txqs[i]; VQ_ALLOC_INFO_INIT(&info[idx+1], sc->vtnet_tx_nsegs, vtnet_tx_vq_intr, txq, &txq->vtntx_vq, "%s-tx%d", device_get_nameunit(dev), txq->vtntx_id); } /* These queues will not be used so allocate the minimum resources. */ for (/**/; i < sc->vtnet_max_vq_pairs; i++, idx += 2) { rxq = &sc->vtnet_rxqs[i]; VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, rxq, &rxq->vtnrx_vq, "%s-rx%d", device_get_nameunit(dev), rxq->vtnrx_id); txq = &sc->vtnet_txqs[i]; VQ_ALLOC_INFO_INIT(&info[idx+1], 0, NULL, txq, &txq->vtntx_vq, "%s-tx%d", device_get_nameunit(dev), txq->vtntx_id); } if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, NULL, &sc->vtnet_ctrl_vq, "%s ctrl", device_get_nameunit(dev)); } /* * TODO: Enable interrupt binding if this is multiqueue. This will * only matter when per-virtqueue MSIX is available. 
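 * For now the VTNET_FLAG_MQ case below is only a placeholder that adds
 * no allocation flags.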
*/ if (sc->vtnet_flags & VTNET_FLAG_MQ) flags |= 0; error = virtio_alloc_virtqueues(dev, flags, nvqs, info); free(info, M_TEMP); return (error); } static int vtnet_alloc_interface(struct vtnet_softc *sc) { device_t dev; if_t ifp; dev = sc->vtnet_dev; ifp = if_alloc(IFT_ETHER); if (ifp == NULL) return (ENOMEM); sc->vtnet_ifp = ifp; if_setsoftc(ifp, sc); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); return (0); } static int vtnet_setup_interface(struct vtnet_softc *sc) { device_t dev; struct pfil_head_args pa; if_t ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH); if_setbaudrate(ifp, IF_Gbps(10)); if_setinitfn(ifp, vtnet_init); if_setioctlfn(ifp, vtnet_ioctl); if_setgetcounterfn(ifp, vtnet_get_counter); #ifndef VTNET_LEGACY_TX if_settransmitfn(ifp, vtnet_txq_mq_start); if_setqflushfn(ifp, vtnet_qflush); #else struct virtqueue *vq = sc->vtnet_txqs[0].vtntx_vq; if_setstartfn(ifp, vtnet_start); if_setsendqlen(ifp, virtqueue_size(vq) - 1); if_setsendqready(ifp); #endif vtnet_get_macaddr(sc); if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS)) if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0); ifmedia_init(&sc->vtnet_media, 0, vtnet_ifmedia_upd, vtnet_ifmedia_sts); ifmedia_add(&sc->vtnet_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->vtnet_media, IFM_ETHER | IFM_AUTO); if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) { int gso; if_setcapabilitiesbit(ifp, IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6, 0); gso = virtio_with_feature(dev, VIRTIO_NET_F_GSO); if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4)) if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN)) sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; if (if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) { int tso_maxlen; if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTSO, 0); tso_maxlen = vtnet_tunable_int(sc, "tso_maxlen", vtnet_tso_maxlen); if_sethwtsomax(ifp, tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); if_sethwtsomaxsegcount(ifp, sc->vtnet_tx_nsegs - 1); if_sethwtsomaxsegsize(ifp, PAGE_SIZE); } } if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) { if_setcapabilitiesbit(ifp, IFCAP_RXCSUM, 0); #ifdef notyet /* BMV: Rx checksums not distinguished between IPv4 and IPv6. */ if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0); #endif if (vtnet_tunable_int(sc, "fixup_needs_csum", vtnet_fixup_needs_csum) != 0) sc->vtnet_flags |= VTNET_FLAG_FIXUP_NEEDS_CSUM; /* Support either "hardware" or software LRO. */ if_setcapabilitiesbit(ifp, IFCAP_LRO, 0); } if (if_getcapabilities(ifp) & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6)) { /* * VirtIO does not support VLAN tagging, but we can fake * it by inserting and removing the 802.1Q header during * transmit and receive. We are then able to do checksum * offloading of VLAN frames. */ if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM, 0); } if (sc->vtnet_max_mtu >= ETHERMTU_JUMBO) if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0); if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0); /* * Capabilities after here are not enabled by default. 
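 * Everything set above is switched on by the if_setcapenable() call
 * below; IFCAP_VLAN_HWFILTER is advertised afterwards so it starts out
 * disabled.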
*/ if_setcapenable(ifp, if_getcapabilities(ifp)); if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWFILTER, 0); sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); } ether_ifattach(ifp, sc->vtnet_hwaddr); /* Tell the upper layer(s) we support long frames. */ if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); DEBUGNET_SET(ifp, vtnet); pa.pa_version = PFIL_VERSION; pa.pa_flags = PFIL_IN; pa.pa_type = PFIL_TYPE_ETHERNET; pa.pa_headname = if_name(ifp); sc->vtnet_pfil = pfil_head_register(&pa); return (0); } static int vtnet_rx_cluster_size(struct vtnet_softc *sc, int mtu) { int framesz; if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) return (MJUMPAGESIZE); else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) return (MCLBYTES); /* * Try to scale the receive mbuf cluster size from the MTU. We * could also use the VQ size to influence the selected size, * but that would only matter for very small queues. */ if (vtnet_modern(sc)) { MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr_v1)); framesz = sizeof(struct virtio_net_hdr_v1); } else framesz = sizeof(struct vtnet_rx_header); framesz += sizeof(struct ether_vlan_header) + mtu; if (framesz <= MCLBYTES) return (MCLBYTES); else if (framesz <= MJUMPAGESIZE) return (MJUMPAGESIZE); else if (framesz <= MJUM9BYTES) return (MJUM9BYTES); /* Sane default; avoid 16KB clusters. */ return (MCLBYTES); } static int vtnet_ioctl_mtu(struct vtnet_softc *sc, u_int mtu) { if_t ifp; int clustersz; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (if_getmtu(ifp) == mtu) return (0); else if (mtu < ETHERMIN || mtu > sc->vtnet_max_mtu) return (EINVAL); if_setmtu(ifp, mtu); clustersz = vtnet_rx_cluster_size(sc, mtu); if (clustersz != sc->vtnet_rx_clustersz && if_getdrvflags(ifp) & IFF_DRV_RUNNING) { if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); vtnet_init_locked(sc, 0); } return (0); } static int vtnet_ioctl_ifflags(struct vtnet_softc *sc) { if_t ifp; int drv_running; ifp = sc->vtnet_ifp; drv_running = (if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0; VTNET_CORE_LOCK_ASSERT(sc); if ((if_getflags(ifp) & IFF_UP) == 0) { if (drv_running) vtnet_stop(sc); goto out; } if (!drv_running) { vtnet_init_locked(sc, 0); goto out; } if ((if_getflags(ifp) ^ sc->vtnet_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) vtnet_rx_filter(sc); else { if ((if_getflags(ifp) ^ sc->vtnet_if_flags) & IFF_ALLMULTI) return (ENOTSUP); if_setflagbits(ifp, IFF_PROMISC, 0); } } out: sc->vtnet_if_flags = if_getflags(ifp); return (0); } static int vtnet_ioctl_multi(struct vtnet_softc *sc) { if_t ifp; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX && if_getdrvflags(ifp) & IFF_DRV_RUNNING) vtnet_rx_filter_mac(sc); return (0); } static int vtnet_ioctl_ifcap(struct vtnet_softc *sc, struct ifreq *ifr) { if_t ifp; int mask, reinit, update; ifp = sc->vtnet_ifp; mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ if_getcapenable(ifp); reinit = update = 0; VTNET_CORE_LOCK_ASSERT(sc); if (mask & IFCAP_TXCSUM) if_togglecapenable(ifp, IFCAP_TXCSUM); if (mask & IFCAP_TXCSUM_IPV6) if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); if (mask & IFCAP_TSO4) if_togglecapenable(ifp, IFCAP_TSO4); if (mask & IFCAP_TSO6) if_togglecapenable(ifp, IFCAP_TSO6); if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO)) { /* * These Rx features require 
the negotiated features to * be updated. Avoid a full reinit if possible. */ if (sc->vtnet_features & VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) update = 1; else reinit = 1; /* BMV: Avoid needless renegotiation for just software LRO. */ if ((mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO)) == IFCAP_LRO && vtnet_software_lro(sc)) reinit = update = 0; if (mask & IFCAP_RXCSUM) if_togglecapenable(ifp, IFCAP_RXCSUM); if (mask & IFCAP_RXCSUM_IPV6) if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); if (mask & IFCAP_LRO) if_togglecapenable(ifp, IFCAP_LRO); /* * VirtIO does not distinguish between IPv4 and IPv6 checksums * so treat them as a pair. Guest TSO (LRO) requires receive * checksums. */ if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) { if_setcapenablebit(ifp, IFCAP_RXCSUM, 0); #ifdef notyet if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0); #endif } else if_setcapenablebit(ifp, 0, (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO)); } if (mask & IFCAP_VLAN_HWFILTER) { /* These Rx features require renegotiation. */ reinit = 1; if (mask & IFCAP_VLAN_HWFILTER) if_togglecapenable(ifp, IFCAP_VLAN_HWFILTER); } if (mask & IFCAP_VLAN_HWTSO) if_togglecapenable(ifp, IFCAP_VLAN_HWTSO); if (mask & IFCAP_VLAN_HWTAGGING) if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING); if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { if (reinit) { if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); vtnet_init_locked(sc, 0); } else if (update) vtnet_update_rx_offloads(sc); } return (0); } static int vtnet_ioctl(if_t ifp, u_long cmd, caddr_t data) { struct vtnet_softc *sc; struct ifreq *ifr; int error; sc = if_getsoftc(ifp); ifr = (struct ifreq *) data; error = 0; switch (cmd) { case SIOCSIFMTU: VTNET_CORE_LOCK(sc); error = vtnet_ioctl_mtu(sc, ifr->ifr_mtu); VTNET_CORE_UNLOCK(sc); break; case SIOCSIFFLAGS: VTNET_CORE_LOCK(sc); error = vtnet_ioctl_ifflags(sc); VTNET_CORE_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: VTNET_CORE_LOCK(sc); error = vtnet_ioctl_multi(sc); VTNET_CORE_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd); break; case SIOCSIFCAP: VTNET_CORE_LOCK(sc); error = vtnet_ioctl_ifcap(sc, ifr); VTNET_CORE_UNLOCK(sc); VLAN_CAPABILITIES(ifp); break; default: error = ether_ioctl(ifp, cmd, data); break; } VTNET_CORE_LOCK_ASSERT_NOTOWNED(sc); return (error); } static int vtnet_rxq_populate(struct vtnet_rxq *rxq) { struct virtqueue *vq; int nbufs, error; #ifdef DEV_NETMAP error = vtnet_netmap_rxq_populate(rxq); if (error >= 0) return (error); #endif /* DEV_NETMAP */ vq = rxq->vtnrx_vq; error = ENOSPC; for (nbufs = 0; !virtqueue_full(vq); nbufs++) { error = vtnet_rxq_new_buf(rxq); if (error) break; } if (nbufs > 0) { virtqueue_notify(vq); /* * EMSGSIZE signifies the virtqueue did not have enough * entries available to hold the last mbuf. This is not * an error. 
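         * (virtqueue_enqueue() is expected to return EMSGSIZE, rather
         * than ENOSPC, when the ring still has free descriptors but not
         * enough for the whole chain.)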
         */
        if (error == EMSGSIZE)
            error = 0;
    }

    return (error);
}

static void
vtnet_rxq_free_mbufs(struct vtnet_rxq *rxq)
{
    struct virtqueue *vq;
    struct mbuf *m;
    int last;
#ifdef DEV_NETMAP
    struct netmap_kring *kring = netmap_kring_on(NA(rxq->vtnrx_sc->vtnet_ifp),
        rxq->vtnrx_id, NR_RX);
#else  /* !DEV_NETMAP */
    void *kring = NULL;
#endif /* !DEV_NETMAP */

    vq = rxq->vtnrx_vq;
    last = 0;

    while ((m = virtqueue_drain(vq, &last)) != NULL) {
        if (kring == NULL)
            m_freem(m);
    }

    KASSERT(virtqueue_empty(vq),
        ("%s: mbufs remaining in rx queue %p", __func__, rxq));
}

static struct mbuf *
vtnet_rx_alloc_buf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp)
{
    struct mbuf *m_head, *m_tail, *m;
    int i, size;

    m_head = NULL;
    size = sc->vtnet_rx_clustersz;

    KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
        ("%s: mbuf %d chain requested without LRO_NOMRG", __func__, nbufs));

    for (i = 0; i < nbufs; i++) {
        m = m_getjcl(M_NOWAIT, MT_DATA, i == 0 ? M_PKTHDR : 0, size);
        if (m == NULL) {
            sc->vtnet_stats.mbuf_alloc_failed++;
            m_freem(m_head);
            return (NULL);
        }

        m->m_len = size;
        if (m_head != NULL) {
            m_tail->m_next = m;
            m_tail = m;
        } else
            m_head = m_tail = m;
    }

    if (m_tailp != NULL)
        *m_tailp = m_tail;

    return (m_head);
}

/*
 * Slow path for when LRO without mergeable buffers is negotiated.
 */
static int
vtnet_rxq_replace_lro_nomrg_buf(struct vtnet_rxq *rxq, struct mbuf *m0,
    int len0)
{
    struct vtnet_softc *sc;
    struct mbuf *m, *m_prev, *m_new, *m_tail;
    int len, clustersz, nreplace, error;

    sc = rxq->vtnrx_sc;
    clustersz = sc->vtnet_rx_clustersz;
    m_prev = NULL;
    m_tail = NULL;
    nreplace = 0;

    m = m0;
    len = len0;

    /*
     * Since these mbuf chains are so large, avoid allocating a complete
     * replacement when the received frame did not consume the entire
     * chain. Unused mbufs are moved to the tail of the replacement mbuf.
     */
    while (len > 0) {
        if (m == NULL) {
            sc->vtnet_stats.rx_frame_too_large++;
            return (EMSGSIZE);
        }

        /*
         * Every mbuf should have the expected cluster size since that
         * is also used to allocate the replacements.
         */
        KASSERT(m->m_len == clustersz,
            ("%s: mbuf size %d not expected cluster size %d", __func__,
            m->m_len, clustersz));

        m->m_len = MIN(m->m_len, len);
        len -= m->m_len;

        m_prev = m;
        m = m->m_next;
        nreplace++;
    }

    KASSERT(nreplace > 0 && nreplace <= sc->vtnet_rx_nmbufs,
        ("%s: invalid replacement mbuf count %d max %d", __func__,
        nreplace, sc->vtnet_rx_nmbufs));

    m_new = vtnet_rx_alloc_buf(sc, nreplace, &m_tail);
    if (m_new == NULL) {
        m_prev->m_len = clustersz;
        return (ENOBUFS);
    }

    /*
     * Move any unused mbufs from the received mbuf chain onto the
     * end of the replacement chain.
     */
    if (m_prev->m_next != NULL) {
        m_tail->m_next = m_prev->m_next;
        m_prev->m_next = NULL;
    }

    error = vtnet_rxq_enqueue_buf(rxq, m_new);
    if (error) {
        /*
         * The replacement is supposed to be a copy of the one
         * dequeued, so this is a very unexpected error.
         *
         * Restore the m0 chain to the original state if it was
         * modified so we can then discard it.
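         * (On failure the caller, vtnet_rxq_eof(), requeues the
         * restored chain via vtnet_rxq_discard_buf().)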
*/ if (m_tail->m_next != NULL) { m_prev->m_next = m_tail->m_next; m_tail->m_next = NULL; } m_prev->m_len = clustersz; sc->vtnet_stats.rx_enq_replacement_failed++; m_freem(m_new); } return (error); } static int vtnet_rxq_replace_buf(struct vtnet_rxq *rxq, struct mbuf *m, int len) { struct vtnet_softc *sc; struct mbuf *m_new; int error; sc = rxq->vtnrx_sc; if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) return (vtnet_rxq_replace_lro_nomrg_buf(rxq, m, len)); MPASS(m->m_next == NULL); if (m->m_len < len) return (EMSGSIZE); m_new = vtnet_rx_alloc_buf(sc, 1, NULL); if (m_new == NULL) return (ENOBUFS); error = vtnet_rxq_enqueue_buf(rxq, m_new); if (error) { sc->vtnet_stats.rx_enq_replacement_failed++; m_freem(m_new); } else m->m_len = len; return (error); } static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *rxq, struct mbuf *m) { struct vtnet_softc *sc; struct sglist *sg; int header_inlined, error; sc = rxq->vtnrx_sc; sg = rxq->vtnrx_sg; KASSERT(m->m_next == NULL || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG, ("%s: mbuf chain without LRO_NOMRG", __func__)); VTNET_RXQ_LOCK_ASSERT(rxq); sglist_reset(sg); header_inlined = vtnet_modern(sc) || (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) != 0; /* TODO: ANY_LAYOUT */ if (header_inlined) error = sglist_append_mbuf(sg, m); else { struct vtnet_rx_header *rxhdr = mtod(m, struct vtnet_rx_header *); MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr)); /* Append the header and remaining mbuf data. */ error = sglist_append(sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size); if (error) return (error); error = sglist_append(sg, &rxhdr[1], m->m_len - sizeof(struct vtnet_rx_header)); if (error) return (error); if (m->m_next != NULL) error = sglist_append_mbuf(sg, m->m_next); } if (error) return (error); return (virtqueue_enqueue(rxq->vtnrx_vq, m, sg, 0, sg->sg_nseg)); } static int vtnet_rxq_new_buf(struct vtnet_rxq *rxq) { struct vtnet_softc *sc; struct mbuf *m; int error; sc = rxq->vtnrx_sc; m = vtnet_rx_alloc_buf(sc, sc->vtnet_rx_nmbufs, NULL); if (m == NULL) return (ENOBUFS); error = vtnet_rxq_enqueue_buf(rxq, m); if (error) m_freem(m); return (error); } static int vtnet_rxq_csum_needs_csum(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t etype, int hoff, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; int error; sc = rxq->vtnrx_sc; /* * NEEDS_CSUM corresponds to Linux's CHECKSUM_PARTIAL, but FreeBSD does * not have an analogous CSUM flag. The checksum has been validated, * but is incomplete (TCP/UDP pseudo header). * * The packet is likely from another VM on the same host that itself * performed checksum offloading so Tx/Rx is basically a memcpy and * the checksum has little value. * * Default to receiving the packet as-is for performance reasons, but * this can cause issues if the packet is to be forwarded because it * does not contain a valid checksum. This patch may be helpful: * https://reviews.freebsd.org/D6611. In the meantime, have the driver * compute the checksum if requested. * * BMV: Need to add an CSUM_PARTIAL flag? */ if ((sc->vtnet_flags & VTNET_FLAG_FIXUP_NEEDS_CSUM) == 0) { error = vtnet_rxq_csum_data_valid(rxq, m, etype, hoff, hdr); return (error); } /* * Compute the checksum in the driver so the packet will contain a * valid checksum. The checksum is at csum_offset from csum_start. */ switch (etype) { #if defined(INET) || defined(INET6) case ETHERTYPE_IP: case ETHERTYPE_IPV6: { int csum_off, csum_end; uint16_t csum; csum_off = hdr->csum_start + hdr->csum_offset; csum_end = csum_off + sizeof(uint16_t); /* Assume checksum will be in the first mbuf. 
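 * If it is not contiguous, return an error that the caller counts as a
 * failed checksum (vrxs_csum_failed).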
*/ if (m->m_len < csum_end || m->m_pkthdr.len < csum_end) return (1); /* * Like in_delayed_cksum()/in6_delayed_cksum(), compute the * checksum and write it at the specified offset. We could * try to verify the packet: csum_start should probably * correspond to the start of the TCP/UDP header. * * BMV: Need to properly handle UDP with zero checksum. Is * the IPv4 header checksum implicitly validated? */ csum = in_cksum_skip(m, m->m_pkthdr.len, hdr->csum_start); *(uint16_t *)(mtodo(m, csum_off)) = csum; m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; } #endif default: sc->vtnet_stats.rx_csum_bad_ethtype++; return (1); } return (0); } static int vtnet_rxq_csum_data_valid(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t etype, int hoff, struct virtio_net_hdr *hdr __unused) { #if 0 struct vtnet_softc *sc; #endif int protocol; #if 0 sc = rxq->vtnrx_sc; #endif switch (etype) { #if defined(INET) case ETHERTYPE_IP: if (__predict_false(m->m_len < hoff + sizeof(struct ip))) protocol = IPPROTO_DONE; else { struct ip *ip = (struct ip *)(m->m_data + hoff); protocol = ip->ip_p; } break; #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(m->m_len < hoff + sizeof(struct ip6_hdr)) || ip6_lasthdr(m, hoff, IPPROTO_IPV6, &protocol) < 0) protocol = IPPROTO_DONE; break; #endif default: protocol = IPPROTO_DONE; break; } switch (protocol) { case IPPROTO_TCP: case IPPROTO_UDP: m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; default: /* * FreeBSD does not support checksum offloading of this * protocol. Let the stack re-verify the checksum later * if the protocol is supported. */ #if 0 if_printf(sc->vtnet_ifp, "%s: checksum offload of unsupported protocol " "etype=%#x protocol=%d csum_start=%d csum_offset=%d\n", __func__, etype, protocol, hdr->csum_start, hdr->csum_offset); #endif break; } return (0); } static int vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { const struct ether_header *eh; int hoff; uint16_t etype; eh = mtod(m, const struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { /* TODO BMV: Handle QinQ. */ const struct ether_vlan_header *evh = mtod(m, const struct ether_vlan_header *); etype = ntohs(evh->evl_proto); hoff = sizeof(struct ether_vlan_header); } else hoff = sizeof(struct ether_header); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) return (vtnet_rxq_csum_needs_csum(rxq, m, etype, hoff, hdr)); else /* VIRTIO_NET_HDR_F_DATA_VALID */ return (vtnet_rxq_csum_data_valid(rxq, m, etype, hoff, hdr)); } static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs) { struct mbuf *m; while (--nbufs > 0) { m = virtqueue_dequeue(rxq->vtnrx_vq, NULL); if (m == NULL) break; vtnet_rxq_discard_buf(rxq, m); } } static void vtnet_rxq_discard_buf(struct vtnet_rxq *rxq, struct mbuf *m) { int error __diagused; /* * Requeue the discarded mbuf. This should always be successful * since it was just dequeued. 
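 * (The chain needs exactly the descriptors that dequeuing it just
 * freed, so the enqueue cannot run out of ring space.)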
*/ error = vtnet_rxq_enqueue_buf(rxq, m); KASSERT(error == 0, ("%s: cannot requeue discarded mbuf %d", __func__, error)); } static int vtnet_rxq_merged_eof(struct vtnet_rxq *rxq, struct mbuf *m_head, int nbufs) { struct vtnet_softc *sc; struct virtqueue *vq; struct mbuf *m_tail; sc = rxq->vtnrx_sc; vq = rxq->vtnrx_vq; m_tail = m_head; while (--nbufs > 0) { struct mbuf *m; uint32_t len; m = virtqueue_dequeue(vq, &len); if (m == NULL) { rxq->vtnrx_stats.vrxs_ierrors++; goto fail; } if (vtnet_rxq_new_buf(rxq) != 0) { rxq->vtnrx_stats.vrxs_iqdrops++; vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) vtnet_rxq_discard_merged_bufs(rxq, nbufs); goto fail; } if (m->m_len < len) len = m->m_len; m->m_len = len; m->m_flags &= ~M_PKTHDR; m_head->m_pkthdr.len += len; m_tail->m_next = m; m_tail = m; } return (0); fail: sc->vtnet_stats.rx_mergeable_failed++; m_freem(m_head); return (1); } #if defined(INET) || defined(INET6) static int vtnet_lro_rx(struct vtnet_rxq *rxq, struct mbuf *m) { struct lro_ctrl *lro; lro = &rxq->vtnrx_lro; if (lro->lro_mbuf_max != 0) { tcp_lro_queue_mbuf(lro, m); return (0); } return (tcp_lro_rx(lro, m, 0)); } #endif static void vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; if_t ifp; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; if (if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING) { struct ether_header *eh = mtod(m, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { vtnet_vlan_tag_remove(m); /* * With the 802.1Q header removed, update the * checksum starting location accordingly. */ if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) hdr->csum_start -= ETHER_VLAN_ENCAP_LEN; } } m->m_pkthdr.flowid = rxq->vtnrx_id; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); if (hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM | VIRTIO_NET_HDR_F_DATA_VALID)) { if (vtnet_rxq_csum(rxq, m, hdr) == 0) rxq->vtnrx_stats.vrxs_csum++; else rxq->vtnrx_stats.vrxs_csum_failed++; } if (hdr->gso_size != 0) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_TCPV6: m->m_pkthdr.lro_nsegs = howmany(m->m_pkthdr.len, hdr->gso_size); rxq->vtnrx_stats.vrxs_host_lro++; break; } } rxq->vtnrx_stats.vrxs_ipackets++; rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len; #if defined(INET) || defined(INET6) if (vtnet_software_lro(sc) && if_getcapenable(ifp) & IFCAP_LRO) { if (vtnet_lro_rx(rxq, m) == 0) return; } #endif if_input(ifp, m); } static int vtnet_rxq_eof(struct vtnet_rxq *rxq) { struct virtio_net_hdr lhdr, *hdr; struct vtnet_softc *sc; if_t ifp; struct virtqueue *vq; int deq, count; sc = rxq->vtnrx_sc; vq = rxq->vtnrx_vq; ifp = sc->vtnet_ifp; deq = 0; count = sc->vtnet_rx_process_limit; VTNET_RXQ_LOCK_ASSERT(rxq); while (count-- > 0) { struct mbuf *m; uint32_t len, nbufs, adjsz; m = virtqueue_dequeue(vq, &len); if (m == NULL) break; deq++; if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) { rxq->vtnrx_stats.vrxs_ierrors++; vtnet_rxq_discard_buf(rxq, m); continue; } if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) { struct virtio_net_hdr_mrg_rxbuf *mhdr = mtod(m, struct virtio_net_hdr_mrg_rxbuf *); kmsan_mark(mhdr, sizeof(*mhdr), KMSAN_STATE_INITED); nbufs = vtnet_htog16(sc, mhdr->num_buffers); adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf); } else if (vtnet_modern(sc)) { nbufs = 1; /* num_buffers is always 1 */ adjsz = sizeof(struct virtio_net_hdr_v1); } else { nbufs = 1; adjsz = sizeof(struct vtnet_rx_header); /* * Account for our gap between the header and start of * data to keep the segments separated. 
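             * VTNET_RX_HEADER_PAD is the pad in the legacy struct
             * vtnet_rx_header that 4-byte aligns the IP header of the
             * frame that follows; count it here before m_adj() strips
             * the full structure below.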
             */
            len += VTNET_RX_HEADER_PAD;
        }

        if (vtnet_rxq_replace_buf(rxq, m, len) != 0) {
            rxq->vtnrx_stats.vrxs_iqdrops++;
            vtnet_rxq_discard_buf(rxq, m);
            if (nbufs > 1)
                vtnet_rxq_discard_merged_bufs(rxq, nbufs);
            continue;
        }

        m->m_pkthdr.len = len;
        m->m_pkthdr.rcvif = ifp;
        m->m_pkthdr.csum_flags = 0;

        if (nbufs > 1) {
            /* Dequeue the rest of chain. */
            if (vtnet_rxq_merged_eof(rxq, m, nbufs) != 0)
                continue;
        }

        kmsan_mark_mbuf(m, KMSAN_STATE_INITED);

        /*
         * Save an endian swapped version of the header prior to it
         * being stripped. The header is always at the start of the
         * mbuf data. num_buffers was already saved (and not needed)
         * so use the standard header.
         */
        hdr = mtod(m, struct virtio_net_hdr *);
        lhdr.flags = hdr->flags;
        lhdr.gso_type = hdr->gso_type;
        lhdr.hdr_len = vtnet_htog16(sc, hdr->hdr_len);
        lhdr.gso_size = vtnet_htog16(sc, hdr->gso_size);
        lhdr.csum_start = vtnet_htog16(sc, hdr->csum_start);
        lhdr.csum_offset = vtnet_htog16(sc, hdr->csum_offset);
        m_adj(m, adjsz);

        if (PFIL_HOOKED_IN(sc->vtnet_pfil)) {
            pfil_return_t pfil;

-           pfil = pfil_run_hooks(sc->vtnet_pfil, &m, ifp, PFIL_IN,
-               NULL);
+           pfil = pfil_mbuf_in(sc->vtnet_pfil, &m, ifp, NULL);
            switch (pfil) {
-           case PFIL_REALLOCED:
-               m = pfil_mem2mbuf(m->m_data);
-               break;
            case PFIL_DROPPED:
            case PFIL_CONSUMED:
                continue;
            default:
                KASSERT(pfil == PFIL_PASS,
                    ("Filter returned %d!", pfil));
            }
        }

        vtnet_rxq_input(rxq, m, &lhdr);
    }

    if (deq > 0) {
#if defined(INET) || defined(INET6)
        if (vtnet_software_lro(sc))
            tcp_lro_flush_all(&rxq->vtnrx_lro);
#endif
        virtqueue_notify(vq);
    }

    return (count > 0 ? 0 : EAGAIN);
}

static void
vtnet_rx_vq_process(struct vtnet_rxq *rxq, int tries)
{
    struct vtnet_softc *sc;
    if_t ifp;
    u_int more;
#ifdef DEV_NETMAP
    int nmirq;
#endif /* DEV_NETMAP */

    sc = rxq->vtnrx_sc;
    ifp = sc->vtnet_ifp;

    if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) {
        /*
         * Ignore this interrupt. Either this is a spurious interrupt
         * or multiqueue without per-VQ MSIX so every queue needs to
         * be polled (a brain dead configuration we could try harder
         * to avoid).
         */
        vtnet_rxq_disable_intr(rxq);
        return;
    }

    VTNET_RXQ_LOCK(rxq);

#ifdef DEV_NETMAP
    /*
     * We call netmap_rx_irq() under lock to prevent concurrent calls.
     * This is not necessary to serialize the access to the RX vq, but
     * rather to avoid races that may happen if this interface is
     * attached to a VALE switch, which would cause received packets
     * to stall in the RX queue (nm_kr_tryget() could find the kring
     * busy when called from netmap_bwrap_intr_notify()).
     */
    nmirq = netmap_rx_irq(ifp, rxq->vtnrx_id, &more);
    if (nmirq != NM_IRQ_PASS) {
        VTNET_RXQ_UNLOCK(rxq);
        if (nmirq == NM_IRQ_RESCHED) {
            taskqueue_enqueue(rxq->vtnrx_tq,
                &rxq->vtnrx_intrtask);
        }
        return;
    }
#endif /* DEV_NETMAP */

again:
    if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
        VTNET_RXQ_UNLOCK(rxq);
        return;
    }

    more = vtnet_rxq_eof(rxq);
    if (more || vtnet_rxq_enable_intr(rxq) != 0) {
        if (!more)
            vtnet_rxq_disable_intr(rxq);
        /*
         * This is an occasional condition or race (when !more),
         * so retry a few times before scheduling the taskqueue.
*/ if (tries-- > 0) goto again; rxq->vtnrx_stats.vrxs_rescheduled++; VTNET_RXQ_UNLOCK(rxq); taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } else VTNET_RXQ_UNLOCK(rxq); } static void vtnet_rx_vq_intr(void *xrxq) { struct vtnet_rxq *rxq; rxq = xrxq; vtnet_rx_vq_process(rxq, VTNET_INTR_DISABLE_RETRIES); } static void vtnet_rxq_tq_intr(void *xrxq, int pending __unused) { struct vtnet_rxq *rxq; rxq = xrxq; vtnet_rx_vq_process(rxq, 0); } static int vtnet_txq_intr_threshold(struct vtnet_txq *txq) { struct vtnet_softc *sc; int threshold; sc = txq->vtntx_sc; /* * The Tx interrupt is disabled until the queue free count falls * below our threshold. Completed frames are drained from the Tx * virtqueue before transmitting new frames and in the watchdog * callout, so the frequency of Tx interrupts is greatly reduced, * at the cost of not freeing mbufs as quickly as they otherwise * would be. */ threshold = virtqueue_size(txq->vtntx_vq) / 4; /* * Without indirect descriptors, leave enough room for the most * segments we handle. */ if ((sc->vtnet_flags & VTNET_FLAG_INDIRECT) == 0 && threshold < sc->vtnet_tx_nsegs) threshold = sc->vtnet_tx_nsegs; return (threshold); } static int vtnet_txq_below_threshold(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; return (virtqueue_nfree(vq) <= txq->vtntx_intr_threshold); } static int vtnet_txq_notify(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; txq->vtntx_watchdog = VTNET_TX_TIMEOUT; virtqueue_notify(vq); if (vtnet_txq_enable_intr(txq) == 0) return (0); /* * Drain frames that were completed since last checked. If this * causes the queue to go above the threshold, the caller should * continue transmitting. */ if (vtnet_txq_eof(txq) != 0 && vtnet_txq_below_threshold(txq) == 0) { virtqueue_disable_intr(vq); return (1); } return (0); } static void vtnet_txq_free_mbufs(struct vtnet_txq *txq) { struct virtqueue *vq; struct vtnet_tx_header *txhdr; int last; #ifdef DEV_NETMAP struct netmap_kring *kring = netmap_kring_on(NA(txq->vtntx_sc->vtnet_ifp), txq->vtntx_id, NR_TX); #else /* !DEV_NETMAP */ void *kring = NULL; #endif /* !DEV_NETMAP */ vq = txq->vtntx_vq; last = 0; while ((txhdr = virtqueue_drain(vq, &last)) != NULL) { if (kring == NULL) { m_freem(txhdr->vth_mbuf); uma_zfree(vtnet_tx_header_zone, txhdr); } } KASSERT(virtqueue_empty(vq), ("%s: mbufs remaining in tx queue %p", __func__, txq)); } /* * BMV: This can go away once we finally have offsets in the mbuf header. */ static int vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m, int *etype, int *proto, int *start) { struct vtnet_softc *sc; struct ether_vlan_header *evh; #if defined(INET) || defined(INET6) int offset; #endif sc = txq->vtntx_sc; evh = mtod(m, struct ether_vlan_header *); if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { /* BMV: We should handle nested VLAN tags too. 
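         * Only a single tag is parsed here: a nested tag leaves *etype
         * as ETHERTYPE_VLAN, which falls into the unknown-ethertype
         * case below.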
*/ *etype = ntohs(evh->evl_proto); #if defined(INET) || defined(INET6) offset = sizeof(struct ether_vlan_header); #endif } else { *etype = ntohs(evh->evl_encap_proto); #if defined(INET) || defined(INET6) offset = sizeof(struct ether_header); #endif } switch (*etype) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip, iphdr; if (__predict_false(m->m_len < offset + sizeof(struct ip))) { m_copydata(m, offset, sizeof(struct ip), (caddr_t) &iphdr); ip = &iphdr; } else ip = (struct ip *)(m->m_data + offset); *proto = ip->ip_p; *start = offset + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: *proto = -1; *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); /* Assert the network stack sent us a valid packet. */ KASSERT(*start > offset, ("%s: mbuf %p start %d offset %d proto %d", __func__, m, *start, offset, *proto)); break; #endif default: sc->vtnet_stats.tx_csum_unknown_ethtype++; return (EINVAL); } return (0); } static int vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type, int offset, struct virtio_net_hdr *hdr) { static struct timeval lastecn; static int curecn; struct vtnet_softc *sc; struct tcphdr *tcp, tcphdr; sc = txq->vtntx_sc; if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) { m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr); tcp = &tcphdr; } else tcp = (struct tcphdr *)(m->m_data + offset); hdr->hdr_len = vtnet_gtoh16(sc, offset + (tcp->th_off << 2)); hdr->gso_size = vtnet_gtoh16(sc, m->m_pkthdr.tso_segsz); hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 : VIRTIO_NET_HDR_GSO_TCPV6; if (__predict_false(tcp->th_flags & TH_CWR)) { /* * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In * FreeBSD, ECN support is not on a per-interface basis, * but globally via the net.inet.tcp.ecn.enable sysctl * knob. The default is off. */ if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) { if (ppsratecheck(&lastecn, &curecn, 1)) if_printf(sc->vtnet_ifp, "TSO with ECN not negotiated with host\n"); return (ENOTSUP); } hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } txq->vtntx_stats.vtxs_tso++; return (0); } static struct mbuf * vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; int flags, etype, csum_start, proto, error; sc = txq->vtntx_sc; flags = m->m_pkthdr.csum_flags; error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start); if (error) goto drop; if (flags & (VTNET_CSUM_OFFLOAD | VTNET_CSUM_OFFLOAD_IPV6)) { /* Sanity check the parsed mbuf matches the offload flags. */ if (__predict_false((flags & VTNET_CSUM_OFFLOAD && etype != ETHERTYPE_IP) || (flags & VTNET_CSUM_OFFLOAD_IPV6 && etype != ETHERTYPE_IPV6))) { sc->vtnet_stats.tx_csum_proto_mismatch++; goto drop; } hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = vtnet_gtoh16(sc, csum_start); hdr->csum_offset = vtnet_gtoh16(sc, m->m_pkthdr.csum_data); txq->vtntx_stats.vtxs_csum++; } if (flags & (CSUM_IP_TSO | CSUM_IP6_TSO)) { /* * Sanity check the parsed mbuf IP protocol is TCP, and * VirtIO TSO requires the checksum offloading above.
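* (Per the VirtIO spec, a GSO packet must also carry a partial * checksum, i.e. VIRTIO_NET_HDR_F_NEEDS_CSUM with valid csum_start * and csum_offset; the host computes the final TCP checksum of each * segment it produces. For a plain IPv4/TCP frame with no options * this works out to csum_start 34 (14 byte Ethernet + 20 byte IP), * csum_offset 16 and hdr_len 54, with gso_size equal to the TCP MSS, * e.g. 1460 for a 1500 byte MTU.)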
*/ if (__predict_false(proto != IPPROTO_TCP)) { sc->vtnet_stats.tx_tso_not_tcp++; goto drop; } else if (__predict_false((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) == 0)) { sc->vtnet_stats.tx_tso_without_csum++; goto drop; } error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr); if (error) goto drop; } return (m); drop: m_freem(m); return (NULL); } static int vtnet_txq_enqueue_buf(struct vtnet_txq *txq, struct mbuf **m_head, struct vtnet_tx_header *txhdr) { struct vtnet_softc *sc; struct virtqueue *vq; struct sglist *sg; struct mbuf *m; int error; sc = txq->vtntx_sc; vq = txq->vtntx_vq; sg = txq->vtntx_sg; m = *m_head; sglist_reset(sg); error = sglist_append(sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size); if (error != 0 || sg->sg_nseg != 1) { KASSERT(0, ("%s: cannot add header to sglist error %d nseg %d", __func__, error, sg->sg_nseg)); goto fail; } error = sglist_append_mbuf(sg, m); if (error) { m = m_defrag(m, M_NOWAIT); if (m == NULL) goto fail; *m_head = m; sc->vtnet_stats.tx_defragged++; error = sglist_append_mbuf(sg, m); if (error) goto fail; } txhdr->vth_mbuf = m; error = virtqueue_enqueue(vq, txhdr, sg, sg->sg_nseg, 0); return (error); fail: sc->vtnet_stats.tx_defrag_failed++; m_freem(*m_head); *m_head = NULL; return (ENOBUFS); } static int vtnet_txq_encap(struct vtnet_txq *txq, struct mbuf **m_head, int flags) { struct vtnet_tx_header *txhdr; struct virtio_net_hdr *hdr; struct mbuf *m; int error; m = *m_head; M_ASSERTPKTHDR(m); txhdr = uma_zalloc(vtnet_tx_header_zone, flags | M_ZERO); if (txhdr == NULL) { m_freem(m); *m_head = NULL; return (ENOMEM); } /* * Always use the non-mergeable header, regardless of whether mergeable * headers were negotiated, because for transmit num_buffers is always * zero. The vtnet_hdr_size is used to enqueue the right header size segment.
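* (vth_uhdr is a union large enough for the mergeable variant; since * vtnet_txq_enqueue_buf() appends only vtnet_hdr_size bytes of it to * the sglist, the num_buffers field is transmitted only when * mergeable buffers were negotiated, and it is then the zero written * by M_ZERO above, which the host ignores on transmit.)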
*/ hdr = &txhdr->vth_uhdr.hdr; if (m->m_flags & M_VLANTAG) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } m->m_flags &= ~M_VLANTAG; } if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) { m = vtnet_txq_offload(txq, m, hdr); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } } error = vtnet_txq_enqueue_buf(txq, m_head, txhdr); fail: if (error) uma_zfree(vtnet_tx_header_zone, txhdr); return (error); } #ifdef VTNET_LEGACY_TX static void vtnet_start_locked(struct vtnet_txq *txq, if_t ifp) { struct vtnet_softc *sc; struct virtqueue *vq; struct mbuf *m0; int tries, enq; sc = txq->vtntx_sc; vq = txq->vtntx_vq; tries = 0; VTNET_TXQ_LOCK_ASSERT(txq); if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || sc->vtnet_link_active == 0) return; vtnet_txq_eof(txq); again: enq = 0; while (!if_sendq_empty(ifp)) { if (virtqueue_full(vq)) break; m0 = if_dequeue(ifp); if (m0 == NULL) break; if (vtnet_txq_encap(txq, &m0, M_NOWAIT) != 0) { if (m0 != NULL) if_sendq_prepend(ifp, m0); break; } enq++; ETHER_BPF_MTAP(ifp, m0); } if (enq > 0 && vtnet_txq_notify(txq) != 0) { if (tries++ < VTNET_NOTIFY_RETRIES) goto again; txq->vtntx_stats.vtxs_rescheduled++; taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); } } static void vtnet_start(if_t ifp) { struct vtnet_softc *sc; struct vtnet_txq *txq; sc = if_getsoftc(ifp); txq = &sc->vtnet_txqs[0]; VTNET_TXQ_LOCK(txq); vtnet_start_locked(txq, ifp); VTNET_TXQ_UNLOCK(txq); } #else /* !VTNET_LEGACY_TX */ static int vtnet_txq_mq_start_locked(struct vtnet_txq *txq, struct mbuf *m) { struct vtnet_softc *sc; struct virtqueue *vq; struct buf_ring *br; if_t ifp; int enq, tries, error; sc = txq->vtntx_sc; vq = txq->vtntx_vq; br = txq->vtntx_br; ifp = sc->vtnet_ifp; tries = 0; error = 0; VTNET_TXQ_LOCK_ASSERT(txq); if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || sc->vtnet_link_active == 0) { if (m != NULL) error = drbr_enqueue(ifp, br, m); return (error); } if (m != NULL) { error = drbr_enqueue(ifp, br, m); if (error) return (error); } vtnet_txq_eof(txq); again: enq = 0; while ((m = drbr_peek(ifp, br)) != NULL) { if (virtqueue_full(vq)) { drbr_putback(ifp, br, m); break; } if (vtnet_txq_encap(txq, &m, M_NOWAIT) != 0) { if (m != NULL) drbr_putback(ifp, br, m); else drbr_advance(ifp, br); break; } drbr_advance(ifp, br); enq++; ETHER_BPF_MTAP(ifp, m); } if (enq > 0 && vtnet_txq_notify(txq) != 0) { if (tries++ < VTNET_NOTIFY_RETRIES) goto again; txq->vtntx_stats.vtxs_rescheduled++; taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); } return (0); } static int vtnet_txq_mq_start(if_t ifp, struct mbuf *m) { struct vtnet_softc *sc; struct vtnet_txq *txq; int i, npairs, error; sc = if_getsoftc(ifp); npairs = sc->vtnet_act_vq_pairs; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % npairs; else i = curcpu % npairs; txq = &sc->vtnet_txqs[i]; if (VTNET_TXQ_TRYLOCK(txq) != 0) { error = vtnet_txq_mq_start_locked(txq, m); VTNET_TXQ_UNLOCK(txq); } else { error = drbr_enqueue(ifp, txq->vtntx_br, m); taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask); } return (error); } static void vtnet_txq_tq_deferred(void *xtxq, int pending __unused) { struct vtnet_softc *sc; struct vtnet_txq *txq; txq = xtxq; sc = txq->vtntx_sc; VTNET_TXQ_LOCK(txq); if (!drbr_empty(sc->vtnet_ifp, txq->vtntx_br)) vtnet_txq_mq_start_locked(txq, NULL); VTNET_TXQ_UNLOCK(txq); } #endif /* VTNET_LEGACY_TX */ static void vtnet_txq_start(struct vtnet_txq *txq) { struct vtnet_softc *sc; if_t ifp; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; 
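/* Push out whatever is still queued for this ring, via whichever * transmit path this kernel was built with. */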
#ifdef VTNET_LEGACY_TX if (!if_sendq_empty(ifp)) vtnet_start_locked(txq, ifp); #else if (!drbr_empty(ifp, txq->vtntx_br)) vtnet_txq_mq_start_locked(txq, NULL); #endif } static void vtnet_txq_tq_intr(void *xtxq, int pending __unused) { struct vtnet_softc *sc; struct vtnet_txq *txq; if_t ifp; txq = xtxq; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; VTNET_TXQ_LOCK(txq); if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { VTNET_TXQ_UNLOCK(txq); return; } vtnet_txq_eof(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } static int vtnet_txq_eof(struct vtnet_txq *txq) { struct virtqueue *vq; struct vtnet_tx_header *txhdr; struct mbuf *m; int deq; vq = txq->vtntx_vq; deq = 0; VTNET_TXQ_LOCK_ASSERT(txq); while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) { m = txhdr->vth_mbuf; deq++; txq->vtntx_stats.vtxs_opackets++; txq->vtntx_stats.vtxs_obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) txq->vtntx_stats.vtxs_omcasts++; m_freem(m); uma_zfree(vtnet_tx_header_zone, txhdr); } if (virtqueue_empty(vq)) txq->vtntx_watchdog = 0; return (deq); } static void vtnet_tx_vq_intr(void *xtxq) { struct vtnet_softc *sc; struct vtnet_txq *txq; if_t ifp; txq = xtxq; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; if (__predict_false(txq->vtntx_id >= sc->vtnet_act_vq_pairs)) { /* * Ignore this interrupt. Either this is a spurious interrupt * or multiqueue without per-VQ MSIX so every queue needs to * be polled (a brain dead configuration we could try harder * to avoid). */ vtnet_txq_disable_intr(txq); return; } #ifdef DEV_NETMAP if (netmap_tx_irq(ifp, txq->vtntx_id) != NM_IRQ_PASS) return; #endif /* DEV_NETMAP */ VTNET_TXQ_LOCK(txq); if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { VTNET_TXQ_UNLOCK(txq); return; } vtnet_txq_eof(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } static void vtnet_tx_start_all(struct vtnet_softc *sc) { struct vtnet_txq *txq; int i; VTNET_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } } #ifndef VTNET_LEGACY_TX static void vtnet_qflush(if_t ifp) { struct vtnet_softc *sc; struct vtnet_txq *txq; struct mbuf *m; int i; sc = if_getsoftc(ifp); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); while ((m = buf_ring_dequeue_sc(txq->vtntx_br)) != NULL) m_freem(m); VTNET_TXQ_UNLOCK(txq); } if_qflush(ifp); } #endif static int vtnet_watchdog(struct vtnet_txq *txq) { if_t ifp; ifp = txq->vtntx_sc->vtnet_ifp; VTNET_TXQ_LOCK(txq); if (txq->vtntx_watchdog == 1) { /* * Only drain completed frames if the watchdog is about to * expire. If any frames were drained, there may be enough * free descriptors now available to transmit queued frames. * In that case, the timer will immediately be decremented * below, but the timeout is generous enough that this should * not be a problem.
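* (The counter is reset to VTNET_TX_TIMEOUT by vtnet_txq_notify() * whenever frames are submitted, cleared by vtnet_txq_eof() once the * virtqueue is empty, and decremented here once per second from * vtnet_tick(), so a timeout only fires after VTNET_TX_TIMEOUT * consecutive quiet ticks.)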
*/ if (vtnet_txq_eof(txq) != 0) vtnet_txq_start(txq); } if (txq->vtntx_watchdog == 0 || --txq->vtntx_watchdog) { VTNET_TXQ_UNLOCK(txq); return (0); } VTNET_TXQ_UNLOCK(txq); if_printf(ifp, "watchdog timeout on queue %d\n", txq->vtntx_id); return (1); } static void vtnet_accum_stats(struct vtnet_softc *sc, struct vtnet_rxq_stats *rxacc, struct vtnet_txq_stats *txacc) { bzero(rxacc, sizeof(struct vtnet_rxq_stats)); bzero(txacc, sizeof(struct vtnet_txq_stats)); for (int i = 0; i < sc->vtnet_max_vq_pairs; i++) { struct vtnet_rxq_stats *rxst; struct vtnet_txq_stats *txst; rxst = &sc->vtnet_rxqs[i].vtnrx_stats; rxacc->vrxs_ipackets += rxst->vrxs_ipackets; rxacc->vrxs_ibytes += rxst->vrxs_ibytes; rxacc->vrxs_iqdrops += rxst->vrxs_iqdrops; rxacc->vrxs_ierrors += rxst->vrxs_ierrors; rxacc->vrxs_csum += rxst->vrxs_csum; rxacc->vrxs_csum_failed += rxst->vrxs_csum_failed; rxacc->vrxs_rescheduled += rxst->vrxs_rescheduled; txst = &sc->vtnet_txqs[i].vtntx_stats; txacc->vtxs_opackets += txst->vtxs_opackets; txacc->vtxs_obytes += txst->vtxs_obytes; txacc->vtxs_omcasts += txst->vtxs_omcasts; txacc->vtxs_csum += txst->vtxs_csum; txacc->vtxs_tso += txst->vtxs_tso; txacc->vtxs_rescheduled += txst->vtxs_rescheduled; } } static uint64_t vtnet_get_counter(if_t ifp, ift_counter cnt) { struct vtnet_softc *sc; struct vtnet_rxq_stats rxaccum; struct vtnet_txq_stats txaccum; sc = if_getsoftc(ifp); vtnet_accum_stats(sc, &rxaccum, &txaccum); switch (cnt) { case IFCOUNTER_IPACKETS: return (rxaccum.vrxs_ipackets); case IFCOUNTER_IQDROPS: return (rxaccum.vrxs_iqdrops); case IFCOUNTER_IERRORS: return (rxaccum.vrxs_ierrors); case IFCOUNTER_OPACKETS: return (txaccum.vtxs_opackets); #ifndef VTNET_LEGACY_TX case IFCOUNTER_OBYTES: return (txaccum.vtxs_obytes); case IFCOUNTER_OMCASTS: return (txaccum.vtxs_omcasts); #endif default: return (if_get_counter_default(ifp, cnt)); } } static void vtnet_tick(void *xsc) { struct vtnet_softc *sc; if_t ifp; int i, timedout; sc = xsc; ifp = sc->vtnet_ifp; timedout = 0; VTNET_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) timedout |= vtnet_watchdog(&sc->vtnet_txqs[i]); if (timedout != 0) { if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); vtnet_init_locked(sc, 0); } else callout_schedule(&sc->vtnet_tick_ch, hz); } static void vtnet_start_taskqueues(struct vtnet_softc *sc) { device_t dev; struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i, error; dev = sc->vtnet_dev; /* * Errors here are very difficult to recover from - we cannot * easily fail because, if this is during boot, we will hang * when freeing any successfully started taskqueues because * the scheduler isn't up yet. * * Most drivers just ignore the return value - it only fails * with ENOMEM so an error is not likely.
*/ for (i = 0; i < sc->vtnet_req_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; error = taskqueue_start_threads(&rxq->vtnrx_tq, 1, PI_NET, "%s rxq %d", device_get_nameunit(dev), rxq->vtnrx_id); if (error) { device_printf(dev, "failed to start rx taskq %d\n", rxq->vtnrx_id); } txq = &sc->vtnet_txqs[i]; error = taskqueue_start_threads(&txq->vtntx_tq, 1, PI_NET, "%s txq %d", device_get_nameunit(dev), txq->vtntx_id); if (error) { device_printf(dev, "failed to start tx taskq %d\n", txq->vtntx_id); } } } static void vtnet_free_taskqueues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; if (rxq->vtnrx_tq != NULL) { taskqueue_free(rxq->vtnrx_tq); rxq->vtnrx_tq = NULL; } txq = &sc->vtnet_txqs[i]; if (txq->vtntx_tq != NULL) { taskqueue_free(txq->vtntx_tq); txq->vtntx_tq = NULL; } } } static void vtnet_drain_taskqueues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; if (rxq->vtnrx_tq != NULL) taskqueue_drain(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); txq = &sc->vtnet_txqs[i]; if (txq->vtntx_tq != NULL) { taskqueue_drain(txq->vtntx_tq, &txq->vtntx_intrtask); #ifndef VTNET_LEGACY_TX taskqueue_drain(txq->vtntx_tq, &txq->vtntx_defrtask); #endif } } } static void vtnet_drain_rxtx_queues(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; vtnet_rxq_free_mbufs(rxq); txq = &sc->vtnet_txqs[i]; vtnet_txq_free_mbufs(txq); } } static void vtnet_stop_rendezvous(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i; VTNET_CORE_LOCK_ASSERT(sc); /* * Lock and unlock the per-queue mutex so we know the stop * state is visible. Doing only the active queues should be * sufficient, but it does not cost much extra to do all the * queues. */ for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; VTNET_RXQ_LOCK(rxq); VTNET_RXQ_UNLOCK(rxq); txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); VTNET_TXQ_UNLOCK(txq); } } static void vtnet_stop(struct vtnet_softc *sc) { device_t dev; if_t ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); sc->vtnet_link_active = 0; callout_stop(&sc->vtnet_tick_ch); /* Only advisory. */ vtnet_disable_interrupts(sc); #ifdef DEV_NETMAP /* Stop any pending txsync/rxsync and disable them. */ netmap_disable_all_rings(ifp); #endif /* DEV_NETMAP */ /* * Stop the host adapter. This resets it to the pre-initialized * state. It will not generate any interrupts until after it is * reinitialized. */ virtio_stop(dev); vtnet_stop_rendezvous(sc); vtnet_drain_rxtx_queues(sc); sc->vtnet_act_vq_pairs = 1; } static int vtnet_virtio_reinit(struct vtnet_softc *sc) { device_t dev; if_t ifp; uint64_t features; int error; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; features = sc->vtnet_negotiated_features; /* * Re-negotiate with the host, removing any disabled receive * features. Transmit features are disabled only on our side * via if_capenable and if_hwassist.
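* (Receive offloads need the host's cooperation, since the host must * stop delivering frames that depend on them, e.g. LRO aggregates. * Transmit offloads are purely local, so clearing if_hwassist is * sufficient and nothing needs to be renegotiated for them.)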
*/ if ((if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) == 0) features &= ~(VIRTIO_NET_F_GUEST_CSUM | VTNET_LRO_FEATURES); if ((if_getcapenable(ifp) & IFCAP_LRO) == 0) features &= ~VTNET_LRO_FEATURES; if ((if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) == 0) features &= ~VIRTIO_NET_F_CTRL_VLAN; error = virtio_reinit(dev, features); if (error) { device_printf(dev, "virtio reinit error %d\n", error); return (error); } sc->vtnet_features = features; virtio_reinit_complete(dev); return (0); } static void vtnet_init_rx_filters(struct vtnet_softc *sc) { if_t ifp; ifp = sc->vtnet_ifp; if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { vtnet_rx_filter(sc); vtnet_rx_filter_mac(sc); } if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) vtnet_rx_filter_vlan(sc); } static int vtnet_init_rx_queues(struct vtnet_softc *sc) { device_t dev; if_t ifp; struct vtnet_rxq *rxq; int i, clustersz, error; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; clustersz = vtnet_rx_cluster_size(sc, if_getmtu(ifp)); sc->vtnet_rx_clustersz = clustersz; if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) { sc->vtnet_rx_nmbufs = howmany(sizeof(struct vtnet_rx_header) + VTNET_MAX_RX_SIZE, clustersz); KASSERT(sc->vtnet_rx_nmbufs < sc->vtnet_rx_nsegs, ("%s: too many rx mbufs %d for %d segments", __func__, sc->vtnet_rx_nmbufs, sc->vtnet_rx_nsegs)); } else sc->vtnet_rx_nmbufs = 1; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; /* Hold the lock to satisfy asserts. */ VTNET_RXQ_LOCK(rxq); error = vtnet_rxq_populate(rxq); VTNET_RXQ_UNLOCK(rxq); if (error) { device_printf(dev, "cannot populate Rx queue %d\n", i); return (error); } } return (0); } static int vtnet_init_tx_queues(struct vtnet_softc *sc) { struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; txq->vtntx_watchdog = 0; txq->vtntx_intr_threshold = vtnet_txq_intr_threshold(txq); #ifdef DEV_NETMAP netmap_reset(NA(sc->vtnet_ifp), NR_TX, i, 0); #endif /* DEV_NETMAP */ } return (0); } static int vtnet_init_rxtx_queues(struct vtnet_softc *sc) { int error; error = vtnet_init_rx_queues(sc); if (error) return (error); error = vtnet_init_tx_queues(sc); if (error) return (error); return (0); } static void vtnet_set_active_vq_pairs(struct vtnet_softc *sc) { device_t dev; int npairs; dev = sc->vtnet_dev; if ((sc->vtnet_flags & VTNET_FLAG_MQ) == 0) { sc->vtnet_act_vq_pairs = 1; return; } npairs = sc->vtnet_req_vq_pairs; if (vtnet_ctrl_mq_cmd(sc, npairs) != 0) { device_printf(dev, "cannot set active queue pairs to %d, " "falling back to 1 queue pair\n", npairs); npairs = 1; } sc->vtnet_act_vq_pairs = npairs; } static void vtnet_update_rx_offloads(struct vtnet_softc *sc) { if_t ifp; uint64_t features; int error; ifp = sc->vtnet_ifp; features = sc->vtnet_features; VTNET_CORE_LOCK_ASSERT(sc); if (if_getcapabilities(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) { if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) features |= VIRTIO_NET_F_GUEST_CSUM; else features &= ~VIRTIO_NET_F_GUEST_CSUM; } if (if_getcapabilities(ifp) & IFCAP_LRO && !vtnet_software_lro(sc)) { if (if_getcapenable(ifp) & IFCAP_LRO) features |= VTNET_LRO_FEATURES; else features &= ~VTNET_LRO_FEATURES; } error = vtnet_ctrl_guest_offloads(sc, features & (VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN | VIRTIO_NET_F_GUEST_UFO)); if (error) { device_printf(sc->vtnet_dev, "%s: cannot update Rx features\n", __func__); if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); 
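/* With IFF_DRV_RUNNING cleared, vtnet_init_locked() performs a full * stop and reinit cycle instead of returning early, dropping the * stale offload state. */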
vtnet_init_locked(sc, 0); } } else sc->vtnet_features = features; } static int vtnet_reinit(struct vtnet_softc *sc) { if_t ifp; int error; ifp = sc->vtnet_ifp; bcopy(if_getlladdr(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN); error = vtnet_virtio_reinit(sc); if (error) return (error); vtnet_set_macaddr(sc); vtnet_set_active_vq_pairs(sc); if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) vtnet_init_rx_filters(sc); if_sethwassist(ifp, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, VTNET_CSUM_OFFLOAD, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, VTNET_CSUM_OFFLOAD_IPV6, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); error = vtnet_init_rxtx_queues(sc); if (error) return (error); return (0); } static void vtnet_init_locked(struct vtnet_softc *sc, int init_mode) { if_t ifp; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) return; vtnet_stop(sc); #ifdef DEV_NETMAP /* Once stopped we can update the netmap flags, if necessary. */ switch (init_mode) { case VTNET_INIT_NETMAP_ENTER: nm_set_native_flags(NA(ifp)); break; case VTNET_INIT_NETMAP_EXIT: nm_clear_native_flags(NA(ifp)); break; } #endif /* DEV_NETMAP */ if (vtnet_reinit(sc) != 0) { vtnet_stop(sc); return; } if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); vtnet_update_link_status(sc); vtnet_enable_interrupts(sc); callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc); #ifdef DEV_NETMAP /* Re-enable txsync/rxsync. */ netmap_enable_all_rings(ifp); #endif /* DEV_NETMAP */ } static void vtnet_init(void *xsc) { struct vtnet_softc *sc; sc = xsc; VTNET_CORE_LOCK(sc); vtnet_init_locked(sc, 0); VTNET_CORE_UNLOCK(sc); } static void vtnet_free_ctrl_vq(struct vtnet_softc *sc) { /* * The control virtqueue is only polled and therefore it should * already be empty. */ KASSERT(virtqueue_empty(sc->vtnet_ctrl_vq), ("%s: ctrl vq %p not empty", __func__, sc->vtnet_ctrl_vq)); } static void vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie, struct sglist *sg, int readable, int writable) { struct virtqueue *vq; vq = sc->vtnet_ctrl_vq; MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ); VTNET_CORE_LOCK_ASSERT(sc); if (!virtqueue_empty(vq)) return; /* * Poll for the response, but the command is likely completed before * returning from the notify. */ if (virtqueue_enqueue(vq, cookie, sg, readable, writable) == 0) { virtqueue_notify(vq); virtqueue_poll(vq, NULL); } } static int vtnet_ctrl_mac_cmd(struct vtnet_softc *sc, uint8_t *hwaddr) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr __aligned(2); uint8_t pad1; uint8_t addr[ETHER_ADDR_LEN] __aligned(8); uint8_t pad2; uint8_t ack; } s; int error; error = 0; MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_MAC); s.hdr.class = VIRTIO_NET_CTRL_MAC; s.hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET; bcopy(hwaddr, &s.addr[0], ETHER_ADDR_LEN); s.ack = VIRTIO_NET_ERR; sglist_init(&sg, nitems(segs), segs); error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.addr[0], ETHER_ADDR_LEN); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); MPASS(error == 0 && sg.sg_nseg == nitems(segs)); if (error == 0) vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 
0 : EIO); } static int vtnet_ctrl_guest_offloads(struct vtnet_softc *sc, uint64_t offloads) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr __aligned(2); uint8_t pad1; uint64_t offloads __aligned(8); uint8_t pad2; uint8_t ack; } s; int error; error = 0; MPASS(sc->vtnet_features & VIRTIO_NET_F_CTRL_GUEST_OFFLOADS); s.hdr.class = VIRTIO_NET_CTRL_GUEST_OFFLOADS; s.hdr.cmd = VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET; s.offloads = vtnet_gtoh64(sc, offloads); s.ack = VIRTIO_NET_ERR; sglist_init(&sg, nitems(segs), segs); error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.offloads, sizeof(uint64_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); MPASS(error == 0 && sg.sg_nseg == nitems(segs)); if (error == 0) vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_ctrl_mq_cmd(struct vtnet_softc *sc, uint16_t npairs) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr __aligned(2); uint8_t pad1; struct virtio_net_ctrl_mq mq __aligned(2); uint8_t pad2; uint8_t ack; } s; int error; error = 0; MPASS(sc->vtnet_flags & VTNET_FLAG_MQ); s.hdr.class = VIRTIO_NET_CTRL_MQ; s.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET; s.mq.virtqueue_pairs = vtnet_gtoh16(sc, npairs); s.ack = VIRTIO_NET_ERR; sglist_init(&sg, nitems(segs), segs); error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.mq, sizeof(struct virtio_net_ctrl_mq)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); MPASS(error == 0 && sg.sg_nseg == nitems(segs)); if (error == 0) vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, uint8_t cmd, bool on) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr __aligned(2); uint8_t pad1; uint8_t onoff; uint8_t pad2; uint8_t ack; } s; int error; error = 0; MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_RX); s.hdr.class = VIRTIO_NET_CTRL_RX; s.hdr.cmd = cmd; s.onoff = on; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, nitems(segs), segs); error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.onoff, sizeof(uint8_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); MPASS(error == 0 && sg.sg_nseg == nitems(segs)); if (error == 0) vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_set_promisc(struct vtnet_softc *sc, bool on) { return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on)); } static int vtnet_set_allmulti(struct vtnet_softc *sc, bool on) { return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on)); } static void vtnet_rx_filter(struct vtnet_softc *sc) { device_t dev; if_t ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (vtnet_set_promisc(sc, if_getflags(ifp) & IFF_PROMISC) != 0) { device_printf(dev, "cannot %s promiscuous mode\n", if_getflags(ifp) & IFF_PROMISC ? "enable" : "disable"); } if (vtnet_set_allmulti(sc, if_getflags(ifp) & IFF_ALLMULTI) != 0) { device_printf(dev, "cannot %s all-multicast mode\n", if_getflags(ifp) & IFF_ALLMULTI ? 
"enable" : "disable"); } } static u_int vtnet_copy_ifaddr(void *arg, struct sockaddr_dl *sdl, u_int ucnt) { struct vtnet_softc *sc = arg; if (memcmp(LLADDR(sdl), sc->vtnet_hwaddr, ETHER_ADDR_LEN) == 0) return (0); if (ucnt < VTNET_MAX_MAC_ENTRIES) bcopy(LLADDR(sdl), &sc->vtnet_mac_filter->vmf_unicast.macs[ucnt], ETHER_ADDR_LEN); return (1); } static u_int vtnet_copy_maddr(void *arg, struct sockaddr_dl *sdl, u_int mcnt) { struct vtnet_mac_filter *filter = arg; if (mcnt < VTNET_MAX_MAC_ENTRIES) bcopy(LLADDR(sdl), &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN); return (1); } static void vtnet_rx_filter_mac(struct vtnet_softc *sc) { struct virtio_net_ctrl_hdr hdr __aligned(2); struct vtnet_mac_filter *filter; struct sglist_seg segs[4]; struct sglist sg; if_t ifp; bool promisc, allmulti; u_int ucnt, mcnt; int error; uint8_t ack; ifp = sc->vtnet_ifp; filter = sc->vtnet_mac_filter; error = 0; MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_RX); VTNET_CORE_LOCK_ASSERT(sc); /* Unicast MAC addresses: */ ucnt = if_foreach_lladdr(ifp, vtnet_copy_ifaddr, sc); promisc = (ucnt > VTNET_MAX_MAC_ENTRIES); if (promisc) { ucnt = 0; if_printf(ifp, "more than %d MAC addresses assigned, " "falling back to promiscuous mode\n", VTNET_MAX_MAC_ENTRIES); } /* Multicast MAC addresses: */ mcnt = if_foreach_llmaddr(ifp, vtnet_copy_maddr, filter); allmulti = (mcnt > VTNET_MAX_MAC_ENTRIES); if (allmulti) { mcnt = 0; if_printf(ifp, "more than %d multicast MAC addresses " "assigned, falling back to all-multicast mode\n", VTNET_MAX_MAC_ENTRIES); } if (promisc && allmulti) goto out; filter->vmf_unicast.nentries = vtnet_gtoh32(sc, ucnt); filter->vmf_multicast.nentries = vtnet_gtoh32(sc, mcnt); hdr.class = VIRTIO_NET_CTRL_MAC; hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET; ack = VIRTIO_NET_ERR; sglist_init(&sg, nitems(segs), segs); error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &filter->vmf_unicast, sizeof(uint32_t) + ucnt * ETHER_ADDR_LEN); error |= sglist_append(&sg, &filter->vmf_multicast, sizeof(uint32_t) + mcnt * ETHER_ADDR_LEN); error |= sglist_append(&sg, &ack, sizeof(uint8_t)); MPASS(error == 0 && sg.sg_nseg == nitems(segs)); if (error == 0) vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); if (ack != VIRTIO_NET_OK) if_printf(ifp, "error setting host MAC filter table\n"); out: if (promisc != 0 && vtnet_set_promisc(sc, true) != 0) if_printf(ifp, "cannot enable promiscuous mode\n"); if (allmulti != 0 && vtnet_set_allmulti(sc, true) != 0) if_printf(ifp, "cannot enable all-multicast mode\n"); } static int vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr __aligned(2); uint8_t pad1; uint16_t tag __aligned(2); uint8_t pad2; uint8_t ack; } s; int error; error = 0; MPASS(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER); s.hdr.class = VIRTIO_NET_CTRL_VLAN; s.hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL; s.tag = vtnet_gtoh16(sc, tag); s.ack = VIRTIO_NET_ERR; sglist_init(&sg, nitems(segs), segs); error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.tag, sizeof(uint16_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); MPASS(error == 0 && sg.sg_nseg == nitems(segs)); if (error == 0) vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 
0 : EIO); } static void vtnet_rx_filter_vlan(struct vtnet_softc *sc) { int i, bit; uint32_t w; uint16_t tag; MPASS(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER); VTNET_CORE_LOCK_ASSERT(sc); /* Enable the filter for each configured VLAN. */ for (i = 0; i < VTNET_VLAN_FILTER_NWORDS; i++) { w = sc->vtnet_vlan_filter[i]; while ((bit = ffs(w) - 1) != -1) { w &= ~(1 << bit); tag = sizeof(w) * CHAR_BIT * i + bit; if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) { device_printf(sc->vtnet_dev, "cannot enable VLAN %d filter\n", tag); } } } } static void vtnet_update_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { if_t ifp; int idx, bit; ifp = sc->vtnet_ifp; idx = (tag >> 5) & 0x7F; bit = tag & 0x1F; if (tag == 0 || tag > 4095) return; VTNET_CORE_LOCK(sc); if (add) sc->vtnet_vlan_filter[idx] |= (1 << bit); else sc->vtnet_vlan_filter[idx] &= ~(1 << bit); if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER && if_getdrvflags(ifp) & IFF_DRV_RUNNING && vtnet_exec_vlan_filter(sc, add, tag) != 0) { device_printf(sc->vtnet_dev, "cannot %s VLAN %d %s the host filter table\n", add ? "add" : "remove", tag, add ? "to" : "from"); } VTNET_CORE_UNLOCK(sc); } static void vtnet_register_vlan(void *arg, if_t ifp, uint16_t tag) { if (if_getsoftc(ifp) != arg) return; vtnet_update_vlan_filter(arg, 1, tag); } static void vtnet_unregister_vlan(void *arg, if_t ifp, uint16_t tag) { if (if_getsoftc(ifp) != arg) return; vtnet_update_vlan_filter(arg, 0, tag); } static void vtnet_update_speed_duplex(struct vtnet_softc *sc) { if_t ifp; uint32_t speed; ifp = sc->vtnet_ifp; if ((sc->vtnet_features & VIRTIO_NET_F_SPEED_DUPLEX) == 0) return; /* BMV: Ignore duplex. */ speed = virtio_read_dev_config_4(sc->vtnet_dev, offsetof(struct virtio_net_config, speed)); if (speed != UINT32_MAX) if_setbaudrate(ifp, IF_Mbps(speed)); } static int vtnet_is_link_up(struct vtnet_softc *sc) { uint16_t status; if ((sc->vtnet_features & VIRTIO_NET_F_STATUS) == 0) return (1); status = virtio_read_dev_config_2(sc->vtnet_dev, offsetof(struct virtio_net_config, status)); return ((status & VIRTIO_NET_S_LINK_UP) != 0); } static void vtnet_update_link_status(struct vtnet_softc *sc) { if_t ifp; int link; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); link = vtnet_is_link_up(sc); /* Notify if the link status has changed. */ if (link != 0 && sc->vtnet_link_active == 0) { vtnet_update_speed_duplex(sc); sc->vtnet_link_active = 1; if_link_state_change(ifp, LINK_STATE_UP); } else if (link == 0 && sc->vtnet_link_active != 0) { sc->vtnet_link_active = 0; if_link_state_change(ifp, LINK_STATE_DOWN); } } static int vtnet_ifmedia_upd(if_t ifp __unused) { return (EOPNOTSUPP); } static void vtnet_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr) { struct vtnet_softc *sc; sc = if_getsoftc(ifp); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; VTNET_CORE_LOCK(sc); if (vtnet_is_link_up(sc) != 0) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } else ifmr->ifm_active |= IFM_NONE; VTNET_CORE_UNLOCK(sc); } static void vtnet_get_macaddr(struct vtnet_softc *sc) { if (sc->vtnet_flags & VTNET_FLAG_MAC) { virtio_read_device_config_array(sc->vtnet_dev, offsetof(struct virtio_net_config, mac), &sc->vtnet_hwaddr[0], sizeof(uint8_t), ETHER_ADDR_LEN); } else { /* Generate a random locally administered unicast address. 
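* The leading byte 0xB2 (binary 10110010) has the locally * administered bit (0x02) set and the multicast bit (0x01) clear, * so the generated address is a valid unicast address that cannot * collide with any vendor-assigned OUI.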
*/ sc->vtnet_hwaddr[0] = 0xB2; arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0); } } static void vtnet_set_macaddr(struct vtnet_softc *sc) { device_t dev; int error; dev = sc->vtnet_dev; if (sc->vtnet_flags & VTNET_FLAG_CTRL_MAC) { error = vtnet_ctrl_mac_cmd(sc, sc->vtnet_hwaddr); if (error) device_printf(dev, "unable to set MAC address\n"); return; } /* MAC in config is read-only in modern VirtIO. */ if (!vtnet_modern(sc) && sc->vtnet_flags & VTNET_FLAG_MAC) { for (int i = 0; i < ETHER_ADDR_LEN; i++) { virtio_write_dev_config_1(dev, offsetof(struct virtio_net_config, mac) + i, sc->vtnet_hwaddr[i]); } } } static void vtnet_attached_set_macaddr(struct vtnet_softc *sc) { /* Assign MAC address if it was generated. */ if ((sc->vtnet_flags & VTNET_FLAG_MAC) == 0) vtnet_set_macaddr(sc); } static void vtnet_vlan_tag_remove(struct mbuf *m) { struct ether_vlan_header *evh; evh = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag); m->m_flags |= M_VLANTAG; /* Strip the 802.1Q header. */ bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } static void vtnet_set_rx_process_limit(struct vtnet_softc *sc) { int limit; limit = vtnet_tunable_int(sc, "rx_process_limit", vtnet_rx_process_limit); if (limit < 0) limit = INT_MAX; sc->vtnet_rx_process_limit = limit; } static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_rxq *rxq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct vtnet_rxq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue"); list = SYSCTL_CHILDREN(node); stats = &rxq->vtnrx_stats; SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD, &stats->vrxs_ipackets, "Receive packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD, &stats->vrxs_ibytes, "Receive bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD, &stats->vrxs_iqdrops, "Receive drops"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD, &stats->vrxs_ierrors, "Receive errors"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vrxs_csum, "Receive checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD, &stats->vrxs_csum_failed, "Receive checksum offload failed"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "host_lro", CTLFLAG_RD, &stats->vrxs_host_lro, "Receive host segmentation offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, &stats->vrxs_rescheduled, "Receive interrupt handler rescheduled"); } static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_txq *txq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct vtnet_txq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vtntx_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue"); list = SYSCTL_CHILDREN(node); stats = &txq->vtntx_stats; SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD, &stats->vtxs_opackets, "Transmit packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD, &stats->vtxs_obytes, "Transmit bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD, &stats->vtxs_omcasts, "Transmit multicasts"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, 
&stats->vtxs_csum, "Transmit checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD, &stats->vtxs_tso, "Transmit TCP segmentation offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, &stats->vtxs_rescheduled, "Transmit interrupt handler rescheduled"); } static void vtnet_setup_queue_sysctl(struct vtnet_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; int i; dev = sc->vtnet_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); for (i = 0; i < sc->vtnet_req_vq_pairs; i++) { vtnet_setup_rxq_sysctl(ctx, child, &sc->vtnet_rxqs[i]); vtnet_setup_txq_sysctl(ctx, child, &sc->vtnet_txqs[i]); } } static void vtnet_setup_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_softc *sc) { struct vtnet_statistics *stats; struct vtnet_rxq_stats rxaccum; struct vtnet_txq_stats txaccum; vtnet_accum_stats(sc, &rxaccum, &txaccum); stats = &sc->vtnet_stats; stats->rx_csum_offloaded = rxaccum.vrxs_csum; stats->rx_csum_failed = rxaccum.vrxs_csum_failed; stats->rx_task_rescheduled = rxaccum.vrxs_rescheduled; stats->tx_csum_offloaded = txaccum.vtxs_csum; stats->tx_tso_offloaded = txaccum.vtxs_tso; stats->tx_task_rescheduled = txaccum.vtxs_rescheduled; SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "mbuf_alloc_failed", CTLFLAG_RD, &stats->mbuf_alloc_failed, "Mbuf cluster allocation failures"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_frame_too_large", CTLFLAG_RD, &stats->rx_frame_too_large, "Received frame larger than the mbuf chain"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_enq_replacement_failed", CTLFLAG_RD, &stats->rx_enq_replacement_failed, "Enqueuing the replacement receive mbuf failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_mergeable_failed", CTLFLAG_RD, &stats->rx_mergeable_failed, "Mergeable buffers receive failures"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ethtype", CTLFLAG_RD, &stats->rx_csum_bad_ethtype, "Received checksum offloaded buffer with unsupported " "Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ipproto", CTLFLAG_RD, &stats->rx_csum_bad_ipproto, "Received checksum offloaded buffer with incorrect IP protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_offset", CTLFLAG_RD, &stats->rx_csum_bad_offset, "Received checksum offloaded buffer with incorrect offset"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_proto", CTLFLAG_RD, &stats->rx_csum_bad_proto, "Received checksum offloaded buffer with incorrect protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_failed", CTLFLAG_RD, &stats->rx_csum_failed, "Received buffer checksum offload failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_offloaded", CTLFLAG_RD, &stats->rx_csum_offloaded, "Received buffer checksum offload succeeded"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_task_rescheduled", CTLFLAG_RD, &stats->rx_task_rescheduled, "Times the receive interrupt task rescheduled itself"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_unknown_ethtype", CTLFLAG_RD, &stats->tx_csum_unknown_ethtype, "Aborted transmit of checksum offloaded buffer with unknown " "Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_proto_mismatch", CTLFLAG_RD, &stats->tx_csum_proto_mismatch, "Aborted transmit of checksum offloaded buffer because mismatched " "protocols"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_not_tcp", CTLFLAG_RD, &stats->tx_tso_not_tcp, "Aborted transmit of TSO buffer with 
non TCP protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_without_csum", CTLFLAG_RD, &stats->tx_tso_without_csum, "Aborted transmit of TSO buffer without TCP checksum offload"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defragged", CTLFLAG_RD, &stats->tx_defragged, "Transmit mbufs defragged"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defrag_failed", CTLFLAG_RD, &stats->tx_defrag_failed, "Aborted transmit of buffer because defrag failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_offloaded", CTLFLAG_RD, &stats->tx_csum_offloaded, "Offloaded checksum of transmitted buffer"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_offloaded", CTLFLAG_RD, &stats->tx_tso_offloaded, "Segmentation offload of transmitted buffer"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_task_rescheduled", CTLFLAG_RD, &stats->tx_task_rescheduled, "Times the transmit interrupt task rescheduled itself"); } static void vtnet_setup_sysctl(struct vtnet_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vtnet_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_vq_pairs", CTLFLAG_RD, &sc->vtnet_max_vq_pairs, 0, "Number of maximum supported virtqueue pairs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "req_vq_pairs", CTLFLAG_RD, &sc->vtnet_req_vq_pairs, 0, "Number of requested virtqueue pairs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "act_vq_pairs", CTLFLAG_RD, &sc->vtnet_act_vq_pairs, 0, "Number of active virtqueue pairs"); vtnet_setup_stat_sysctl(ctx, child, sc); } static void vtnet_load_tunables(struct vtnet_softc *sc) { sc->vtnet_lro_entry_count = vtnet_tunable_int(sc, "lro_entry_count", vtnet_lro_entry_count); if (sc->vtnet_lro_entry_count < TCP_LRO_ENTRIES) sc->vtnet_lro_entry_count = TCP_LRO_ENTRIES; sc->vtnet_lro_mbufq_depth = vtnet_tunable_int(sc, "lro_mbufq_depth", vtnet_lro_mbufq_depth); } static int vtnet_rxq_enable_intr(struct vtnet_rxq *rxq) { return (virtqueue_enable_intr(rxq->vtnrx_vq)); } static void vtnet_rxq_disable_intr(struct vtnet_rxq *rxq) { virtqueue_disable_intr(rxq->vtnrx_vq); } static int vtnet_txq_enable_intr(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; if (vtnet_txq_below_threshold(txq) != 0) return (virtqueue_postpone_intr(vq, VQ_POSTPONE_LONG)); /* * The free count is above our threshold. Keep the Tx interrupt * disabled until the queue is fuller. 
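* In that case completed frames are reaped opportunistically instead, * before the next transmit and from the watchdog callout, as * described in vtnet_txq_intr_threshold().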
*/ return (0); } static void vtnet_txq_disable_intr(struct vtnet_txq *txq) { virtqueue_disable_intr(txq->vtntx_vq); } static void vtnet_enable_rx_interrupts(struct vtnet_softc *sc) { struct vtnet_rxq *rxq; int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; if (vtnet_rxq_enable_intr(rxq) != 0) taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } } static void vtnet_enable_tx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_txq_enable_intr(&sc->vtnet_txqs[i]); } static void vtnet_enable_interrupts(struct vtnet_softc *sc) { vtnet_enable_rx_interrupts(sc); vtnet_enable_tx_interrupts(sc); } static void vtnet_disable_rx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_rxq_disable_intr(&sc->vtnet_rxqs[i]); } static void vtnet_disable_tx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_txq_disable_intr(&sc->vtnet_txqs[i]); } static void vtnet_disable_interrupts(struct vtnet_softc *sc) { vtnet_disable_rx_interrupts(sc); vtnet_disable_tx_interrupts(sc); } static int vtnet_tunable_int(struct vtnet_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "hw.vtnet.%d.%s", device_get_unit(sc->vtnet_dev), knob); TUNABLE_INT_FETCH(path, &def); return (def); } #ifdef DEBUGNET static void vtnet_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize) { struct vtnet_softc *sc; sc = if_getsoftc(ifp); VTNET_CORE_LOCK(sc); *nrxr = sc->vtnet_req_vq_pairs; *ncl = DEBUGNET_MAX_IN_FLIGHT; *clsize = sc->vtnet_rx_clustersz; VTNET_CORE_UNLOCK(sc); } static void vtnet_debugnet_event(if_t ifp __unused, enum debugnet_ev event) { struct vtnet_softc *sc; static bool sw_lro_enabled = false; /* * Disable software LRO, since it would require entering the network * epoch when calling vtnet_txq_eof() in vtnet_debugnet_poll(). */ sc = if_getsoftc(ifp); switch (event) { case DEBUGNET_START: sw_lro_enabled = (sc->vtnet_flags & VTNET_FLAG_SW_LRO) != 0; if (sw_lro_enabled) sc->vtnet_flags &= ~VTNET_FLAG_SW_LRO; break; case DEBUGNET_END: if (sw_lro_enabled) sc->vtnet_flags |= VTNET_FLAG_SW_LRO; break; } } static int vtnet_debugnet_transmit(if_t ifp, struct mbuf *m) { struct vtnet_softc *sc; struct vtnet_txq *txq; int error; sc = if_getsoftc(ifp); if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (EBUSY); txq = &sc->vtnet_txqs[0]; error = vtnet_txq_encap(txq, &m, M_NOWAIT | M_USE_RESERVE); if (error == 0) (void)vtnet_txq_notify(txq); return (error); } static int vtnet_debugnet_poll(if_t ifp, int count) { struct vtnet_softc *sc; int i; sc = if_getsoftc(ifp); if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (EBUSY); (void)vtnet_txq_eof(&sc->vtnet_txqs[0]); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) (void)vtnet_rxq_eof(&sc->vtnet_rxqs[i]); return (0); } #endif /* DEBUGNET */ diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index 5a4954e84869..9f99434dd4e0 100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -1,3798 +1,3804 @@ /* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Jason R. Thorpe for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp */ /* * Network interface bridge support. * * TODO: * * - Currently only supports Ethernet-like interfaces (Ethernet, * 802.11, VLANs on Ethernet, etc.) Figure out a nice way * to bridge other types of interfaces (maybe consider * heterogeneous bridges). 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include /* for net/if.h */ #include #include /* string functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #if defined(INET) || defined(INET6) #include #endif #include #include #include #include #include #include #include #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* * Size of the route hash table. Must be a power of two. */ #ifndef BRIDGE_RTHASH_SIZE #define BRIDGE_RTHASH_SIZE 1024 #endif #define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1) /* * Default maximum number of addresses to cache. */ #ifndef BRIDGE_RTABLE_MAX #define BRIDGE_RTABLE_MAX 2000 #endif /* * Timeout (in seconds) for entries learned dynamically. */ #ifndef BRIDGE_RTABLE_TIMEOUT #define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */ #endif /* * Number of seconds between walks of the route list. */ #ifndef BRIDGE_RTABLE_PRUNE_PERIOD #define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60) #endif /* * List of capabilities to possibly mask on the member interface. */ #define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM|\ IFCAP_TXCSUM_IPV6) /* * List of capabilities to strip */ #define BRIDGE_IFCAPS_STRIP IFCAP_LRO /* * Bridge locking * * The bridge relies heavily on the epoch(9) system to protect its data * structures. This means we can safely use CK_LISTs while in NET_EPOCH, but we * must ensure there is only one writer at a time. * * That is: for read accesses we only need to be in NET_EPOCH, but for write * accesses we must hold: * * - BRIDGE_RT_LOCK, for any change to bridge_rtnodes * - BRIDGE_LOCK, for any other change * * The BRIDGE_LOCK is a sleepable lock, because it is held across ioctl() * calls to bridge member interfaces and these ioctl()s can sleep. * The BRIDGE_RT_LOCK is a non-sleepable mutex, because it is sometimes * required while we're in NET_EPOCH and then we're not allowed to sleep. */ #define BRIDGE_LOCK_INIT(_sc) do { \ sx_init(&(_sc)->sc_sx, "if_bridge"); \ mtx_init(&(_sc)->sc_rt_mtx, "if_bridge rt", NULL, MTX_DEF); \ } while (0) #define BRIDGE_LOCK_DESTROY(_sc) do { \ sx_destroy(&(_sc)->sc_sx); \ mtx_destroy(&(_sc)->sc_rt_mtx); \ } while (0) #define BRIDGE_LOCK(_sc) sx_xlock(&(_sc)->sc_sx) #define BRIDGE_UNLOCK(_sc) sx_xunlock(&(_sc)->sc_sx) #define BRIDGE_LOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SX_XLOCKED) #define BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(_sc) \ MPASS(in_epoch(net_epoch_preempt) || sx_xlocked(&(_sc)->sc_sx)) #define BRIDGE_UNLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SX_UNLOCKED) #define BRIDGE_RT_LOCK(_sc) mtx_lock(&(_sc)->sc_rt_mtx) #define BRIDGE_RT_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_rt_mtx) #define BRIDGE_RT_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->sc_rt_mtx, MA_OWNED) #define BRIDGE_RT_LOCK_OR_NET_EPOCH_ASSERT(_sc) \ MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(_sc)->sc_rt_mtx)) /* * Bridge interface list entry. 
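* One entry exists per member port, linked into sc_iflist. Readers * walk the CK_LIST under the network epoch; writers hold BRIDGE_LOCK, * and freed entries are reclaimed through bif_epoch_ctx once all * epoch readers have drained.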
*/ struct bridge_iflist { CK_LIST_ENTRY(bridge_iflist) bif_next; struct ifnet *bif_ifp; /* member if */ struct bstp_port bif_stp; /* STP state */ uint32_t bif_flags; /* member if flags */ int bif_savedcaps; /* saved capabilities */ uint32_t bif_addrmax; /* max # of addresses */ uint32_t bif_addrcnt; /* cur. # of addresses */ uint32_t bif_addrexceeded;/* # of address violations */ struct epoch_context bif_epoch_ctx; }; /* * Bridge route node. */ struct bridge_rtnode { CK_LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ CK_LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ struct bridge_iflist *brt_dst; /* destination if */ unsigned long brt_expire; /* expiration time */ uint8_t brt_flags; /* address flags */ uint8_t brt_addr[ETHER_ADDR_LEN]; uint16_t brt_vlan; /* vlan id */ struct vnet *brt_vnet; struct epoch_context brt_epoch_ctx; }; #define brt_ifp brt_dst->bif_ifp /* * Software state for each bridge. */ struct bridge_softc { struct ifnet *sc_ifp; /* make this an interface */ LIST_ENTRY(bridge_softc) sc_list; struct sx sc_sx; struct mtx sc_rt_mtx; uint32_t sc_brtmax; /* max # of addresses */ uint32_t sc_brtcnt; /* cur. # of addresses */ uint32_t sc_brttimeout; /* rt timeout in seconds */ struct callout sc_brcallout; /* bridge callout */ CK_LIST_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ CK_LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ CK_LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ uint32_t sc_rthash_key; /* key for hash */ CK_LIST_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ struct bstp_state sc_stp; /* STP state */ uint32_t sc_brtexceeded; /* # of cache drops */ struct ifnet *sc_ifaddr; /* member mac copied from */ struct ether_addr sc_defaddr; /* Default MAC address */ struct epoch_context sc_epoch_ctx; }; VNET_DEFINE_STATIC(struct sx, bridge_list_sx); #define V_bridge_list_sx VNET(bridge_list_sx) static eventhandler_tag bridge_detach_cookie; int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; VNET_DEFINE_STATIC(uma_zone_t, bridge_rtnode_zone); #define V_bridge_rtnode_zone VNET(bridge_rtnode_zone) static int bridge_clone_create(struct if_clone *, char *, size_t, struct ifc_data *, struct ifnet **); static int bridge_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static int bridge_ioctl(struct ifnet *, u_long, caddr_t); static void bridge_mutecaps(struct bridge_softc *); static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *, int); static void bridge_ifdetach(void *arg __unused, struct ifnet *); static void bridge_init(void *); static void bridge_dummynet(struct mbuf *, struct ifnet *); static void bridge_stop(struct ifnet *, int); static int bridge_transmit(struct ifnet *, struct mbuf *); #ifdef ALTQ static void bridge_altq_start(if_t); static int bridge_altq_transmit(if_t, struct mbuf *); #endif static void bridge_qflush(struct ifnet *); static struct mbuf *bridge_input(struct ifnet *, struct mbuf *); static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static int bridge_enqueue(struct bridge_softc *, struct ifnet *, struct mbuf *); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, struct mbuf *m); static void bridge_timer(void *); static void bridge_broadcast(struct bridge_softc *, struct ifnet *, struct mbuf *, int); static void bridge_span(struct bridge_softc *, struct mbuf *); static int bridge_rtupdate(struct 
bridge_softc *, const uint8_t *, uint16_t, struct bridge_iflist *, int, uint8_t); static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *, uint16_t); static void bridge_rttrim(struct bridge_softc *); static void bridge_rtage(struct bridge_softc *); static void bridge_rtflush(struct bridge_softc *, int); static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, uint16_t); static void bridge_rtable_init(struct bridge_softc *); static void bridge_rtable_fini(struct bridge_softc *); static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *); static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *, const uint8_t *, uint16_t); static int bridge_rtnode_insert(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtnode_destroy(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtable_expire(struct ifnet *, int); static void bridge_state_change(struct ifnet *, int); static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *, const char *name); static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *, struct ifnet *ifp); static void bridge_delete_member(struct bridge_softc *, struct bridge_iflist *, int); static void bridge_delete_span(struct bridge_softc *, struct bridge_iflist *); static int bridge_ioctl_add(struct bridge_softc *, void *); static int bridge_ioctl_del(struct bridge_softc *, void *); static int bridge_ioctl_gifflags(struct bridge_softc *, void *); static int bridge_ioctl_sifflags(struct bridge_softc *, void *); static int bridge_ioctl_scache(struct bridge_softc *, void *); static int bridge_ioctl_gcache(struct bridge_softc *, void *); static int bridge_ioctl_gifs(struct bridge_softc *, void *); static int bridge_ioctl_rts(struct bridge_softc *, void *); static int bridge_ioctl_saddr(struct bridge_softc *, void *); static int bridge_ioctl_sto(struct bridge_softc *, void *); static int bridge_ioctl_gto(struct bridge_softc *, void *); static int bridge_ioctl_daddr(struct bridge_softc *, void *); static int bridge_ioctl_flush(struct bridge_softc *, void *); static int bridge_ioctl_gpri(struct bridge_softc *, void *); static int bridge_ioctl_spri(struct bridge_softc *, void *); static int bridge_ioctl_ght(struct bridge_softc *, void *); static int bridge_ioctl_sht(struct bridge_softc *, void *); static int bridge_ioctl_gfd(struct bridge_softc *, void *); static int bridge_ioctl_sfd(struct bridge_softc *, void *); static int bridge_ioctl_gma(struct bridge_softc *, void *); static int bridge_ioctl_sma(struct bridge_softc *, void *); static int bridge_ioctl_sifprio(struct bridge_softc *, void *); static int bridge_ioctl_sifcost(struct bridge_softc *, void *); static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); static int bridge_ioctl_addspan(struct bridge_softc *, void *); static int bridge_ioctl_delspan(struct bridge_softc *, void *); static int bridge_ioctl_gbparam(struct bridge_softc *, void *); static int bridge_ioctl_grte(struct bridge_softc *, void *); static int bridge_ioctl_gifsstp(struct bridge_softc *, void *); static int bridge_ioctl_sproto(struct bridge_softc *, void *); static int bridge_ioctl_stxhc(struct bridge_softc *, void *); static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *, int); static int bridge_ip_checkbasic(struct mbuf **mp); #ifdef INET6 static int bridge_ip6_checkbasic(struct mbuf **mp); #endif /* INET6 */ static int bridge_fragment(struct ifnet *, struct mbuf **mp, struct ether_header *, int, struct llc *); static 
void bridge_linkstate(struct ifnet *ifp); static void bridge_linkcheck(struct bridge_softc *sc); /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ #define VLANTAGOF(_m) \ (_m->m_flags & M_VLANTAG) ? EVL_VLANOFTAG(_m->m_pkthdr.ether_vtag) : 1 static struct bstp_cb_ops bridge_ops = { .bcb_state = bridge_state_change, .bcb_rtage = bridge_rtable_expire }; SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bridge"); /* only pass IP[46] packets when pfil is enabled */ VNET_DEFINE_STATIC(int, pfil_onlyip) = 1; #define V_pfil_onlyip VNET(pfil_onlyip) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_onlyip), 0, "Only pass IP packets when pfil is enabled"); /* run pfil hooks on the bridge interface */ VNET_DEFINE_STATIC(int, pfil_bridge) = 0; #define V_pfil_bridge VNET(pfil_bridge) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_bridge), 0, "Packet filter on the bridge interface"); /* layer2 filter with ipfw */ VNET_DEFINE_STATIC(int, pfil_ipfw); #define V_pfil_ipfw VNET(pfil_ipfw) /* layer2 ARP filter with ipfw */ VNET_DEFINE_STATIC(int, pfil_ipfw_arp); #define V_pfil_ipfw_arp VNET(pfil_ipfw_arp) SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw_arp), 0, "Filter ARP packets through IPFW layer2"); /* run pfil hooks on the member interface */ VNET_DEFINE_STATIC(int, pfil_member) = 0; #define V_pfil_member VNET(pfil_member) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_member), 0, "Packet filter on the member interface"); /* run pfil hooks on the physical interface for locally destined packets */ VNET_DEFINE_STATIC(int, pfil_local_phys); #define V_pfil_local_phys VNET(pfil_local_phys) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_local_phys), 0, "Packet filter on the physical interface for locally destined packets"); /* log STP state changes */ VNET_DEFINE_STATIC(int, log_stp); #define V_log_stp VNET(log_stp) SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(log_stp), 0, "Log STP state changes"); /* share MAC with first bridge member */ VNET_DEFINE_STATIC(int, bridge_inherit_mac); #define V_bridge_inherit_mac VNET(bridge_inherit_mac) SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(bridge_inherit_mac), 0, "Inherit MAC address from the first bridge member"); VNET_DEFINE_STATIC(int, allow_llz_overlap) = 0; #define V_allow_llz_overlap VNET(allow_llz_overlap) SYSCTL_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0, "Allow overlap of link-local scope " "zones of a bridge interface and the member interfaces"); struct bridge_control { int (*bc_func)(struct bridge_softc *, void *); int bc_argsize; int bc_flags; }; #define BC_F_COPYIN 0x01 /* copy arguments in */ #define BC_F_COPYOUT 0x02 /* copy arguments out */ #define BC_F_SUSER 0x04 /* do super-user check */ static const struct bridge_control bridge_control_table[] = { { bridge_ioctl_add, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_del, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gifflags, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_sifflags, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_scache, sizeof(struct 
ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gcache, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_gifs, sizeof(struct ifbifconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_rts, sizeof(struct ifbaconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_saddr, sizeof(struct ifbareq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sto, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gto, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_daddr, sizeof(struct ifbareq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_flush, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gpri, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_spri, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_ght, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sht, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gfd, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sfd, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gma, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sma, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifprio, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifcost, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_addspan, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_delspan, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gbparam, sizeof(struct ifbropreq), BC_F_COPYOUT }, { bridge_ioctl_grte, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_gifsstp, sizeof(struct ifbpstpconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_sproto, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_stxhc, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, }; static const int bridge_control_table_size = nitems(bridge_control_table); VNET_DEFINE_STATIC(LIST_HEAD(, bridge_softc), bridge_list); #define V_bridge_list VNET(bridge_list) #define BRIDGE_LIST_LOCK_INIT(x) sx_init(&V_bridge_list_sx, \ "if_bridge list") #define BRIDGE_LIST_LOCK_DESTROY(x) sx_destroy(&V_bridge_list_sx) #define BRIDGE_LIST_LOCK(x) sx_xlock(&V_bridge_list_sx) #define BRIDGE_LIST_UNLOCK(x) sx_xunlock(&V_bridge_list_sx) VNET_DEFINE_STATIC(struct if_clone *, bridge_cloner); #define V_bridge_cloner VNET(bridge_cloner) static const char bridge_name[] = "bridge"; static void vnet_bridge_init(const void *unused __unused) { V_bridge_rtnode_zone = uma_zcreate("bridge_rtnode", sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); BRIDGE_LIST_LOCK_INIT(); LIST_INIT(&V_bridge_list); struct if_clone_addreq req = { .create_f = bridge_clone_create, .destroy_f = bridge_clone_destroy, .flags = IFC_F_AUTOUNIT, }; V_bridge_cloner = ifc_attach_cloner(bridge_name, &req); } VNET_SYSINIT(vnet_bridge_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_bridge_init, NULL); static void vnet_bridge_uninit(const void *unused __unused) { ifc_detach_cloner(V_bridge_cloner); V_bridge_cloner = NULL; BRIDGE_LIST_LOCK_DESTROY(); /* Callbacks may use the UMA zone. 
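	 * NET_EPOCH_DRAIN_CALLBACKS() runs all pending epoch callbacks to
	 * completion, so nothing can touch the zone after uma_zdestroy()
	 * tears it down.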
	 */
	NET_EPOCH_DRAIN_CALLBACKS();
	uma_zdestroy(V_bridge_rtnode_zone);
}
VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
    vnet_bridge_uninit, NULL);

static int
bridge_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		bridge_dn_p = bridge_dummynet;
		bridge_detach_cookie = EVENTHANDLER_REGISTER(
		    ifnet_departure_event, bridge_ifdetach, NULL,
		    EVENTHANDLER_PRI_ANY);
		break;
	case MOD_UNLOAD:
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    bridge_detach_cookie);
		bridge_dn_p = NULL;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

static moduledata_t bridge_mod = {
	"if_bridge",
	bridge_modevent,
	0
};

DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_bridge, 1);
MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1);

/*
 * Handler for net.link.bridge.ipfw.
 */
static int
sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS)
{
	int enable = V_pfil_ipfw;
	int error;

	error = sysctl_handle_int(oidp, &enable, 0, req);
	enable &= 1;

	if (enable != V_pfil_ipfw) {
		V_pfil_ipfw = enable;

		/*
		 * Disable pfil so that ipfw doesn't run twice; if the user
		 * really wants both then they can re-enable pfil_bridge and/or
		 * pfil_member. Also allow non-ip packets as ipfw can filter by
		 * layer2 type.
		 */
		if (V_pfil_ipfw) {
			V_pfil_onlyip = 0;
			V_pfil_bridge = 0;
			V_pfil_member = 0;
		}
	}

	return (error);
}
SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET | CTLFLAG_NEEDGIANT,
    &VNET_NAME(pfil_ipfw), 0, &sysctl_pfil_ipfw, "I",
    "Layer2 filter with IPFW");

#ifdef VIMAGE
static void
bridge_reassign(struct ifnet *ifp, struct vnet *newvnet, char *arg)
{
	struct bridge_softc *sc = ifp->if_softc;
	struct bridge_iflist *bif;

	BRIDGE_LOCK(sc);

	while ((bif = CK_LIST_FIRST(&sc->sc_iflist)) != NULL)
		bridge_delete_member(sc, bif, 0);

	while ((bif = CK_LIST_FIRST(&sc->sc_spanlist)) != NULL) {
		bridge_delete_span(sc, bif);
	}

	BRIDGE_UNLOCK(sc);

	ether_reassign(ifp, newvnet, arg);
}
#endif

/*
 * bridge_clone_create:
 *
 *	Create a new bridge instance.
 */
static int
bridge_clone_create(struct if_clone *ifc, char *name, size_t len,
    struct ifc_data *ifd, struct ifnet **ifpp)
{
	struct bridge_softc *sc;
	struct ifnet *ifp;

	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		free(sc, M_DEVBUF);
		return (ENOSPC);
	}

	BRIDGE_LOCK_INIT(sc);
	sc->sc_brtmax = BRIDGE_RTABLE_MAX;
	sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT;

	/* Initialize our routing table. */
	bridge_rtable_init(sc);

	callout_init_mtx(&sc->sc_brcallout, &sc->sc_rt_mtx, 0);

	CK_LIST_INIT(&sc->sc_iflist);
	CK_LIST_INIT(&sc->sc_spanlist);

	ifp->if_softc = sc;
	if_initname(ifp, bridge_name, ifd->unit);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = bridge_ioctl;
#ifdef ALTQ
	ifp->if_start = bridge_altq_start;
	ifp->if_transmit = bridge_altq_transmit;
	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
	ifp->if_snd.ifq_drv_maxlen = 0;
	IFQ_SET_READY(&ifp->if_snd);
#else
	ifp->if_transmit = bridge_transmit;
#endif
	ifp->if_qflush = bridge_qflush;
	ifp->if_init = bridge_init;
	ifp->if_type = IFT_BRIDGE;

	ether_gen_addr(ifp, &sc->sc_defaddr);

	bstp_attach(&sc->sc_stp, &bridge_ops);
	ether_ifattach(ifp, sc->sc_defaddr.octet);
	/* Now undo some of the damage...
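	 * ether_ifattach() expects a plain Ethernet ifnet, so the interface
	 * type and baudrate it set are switched back to bridge-appropriate
	 * values below.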
*/ ifp->if_baudrate = 0; ifp->if_type = IFT_BRIDGE; #ifdef VIMAGE ifp->if_reassign = bridge_reassign; #endif BRIDGE_LIST_LOCK(); LIST_INSERT_HEAD(&V_bridge_list, sc, sc_list); BRIDGE_LIST_UNLOCK(); *ifpp = ifp; return (0); } static void bridge_clone_destroy_cb(struct epoch_context *ctx) { struct bridge_softc *sc; sc = __containerof(ctx, struct bridge_softc, sc_epoch_ctx); BRIDGE_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } /* * bridge_clone_destroy: * * Destroy a bridge instance. */ static int bridge_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct bridge_softc *sc = ifp->if_softc; struct bridge_iflist *bif; struct epoch_tracker et; BRIDGE_LOCK(sc); bridge_stop(ifp, 1); ifp->if_flags &= ~IFF_UP; while ((bif = CK_LIST_FIRST(&sc->sc_iflist)) != NULL) bridge_delete_member(sc, bif, 0); while ((bif = CK_LIST_FIRST(&sc->sc_spanlist)) != NULL) { bridge_delete_span(sc, bif); } /* Tear down the routing table. */ bridge_rtable_fini(sc); BRIDGE_UNLOCK(sc); NET_EPOCH_ENTER(et); callout_drain(&sc->sc_brcallout); BRIDGE_LIST_LOCK(); LIST_REMOVE(sc, sc_list); BRIDGE_LIST_UNLOCK(); bstp_detach(&sc->sc_stp); #ifdef ALTQ IFQ_PURGE(&ifp->if_snd); #endif NET_EPOCH_EXIT(et); ether_ifdetach(ifp); if_free(ifp); NET_EPOCH_CALL(bridge_clone_destroy_cb, &sc->sc_epoch_ctx); return (0); } /* * bridge_ioctl: * * Handle a control request from the operator. */ static int bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct bridge_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct bridge_iflist *bif; struct thread *td = curthread; union { struct ifbreq ifbreq; struct ifbifconf ifbifconf; struct ifbareq ifbareq; struct ifbaconf ifbaconf; struct ifbrparam ifbrparam; struct ifbropreq ifbropreq; } args; struct ifdrv *ifd = (struct ifdrv *) data; const struct bridge_control *bc; int error = 0, oldmtu; BRIDGE_LOCK(sc); switch (cmd) { case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGDRVSPEC: case SIOCSDRVSPEC: if (ifd->ifd_cmd >= bridge_control_table_size) { error = EINVAL; break; } bc = &bridge_control_table[ifd->ifd_cmd]; if (cmd == SIOCGDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) == 0) { error = EINVAL; break; } else if (cmd == SIOCSDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) != 0) { error = EINVAL; break; } if (bc->bc_flags & BC_F_SUSER) { error = priv_check(td, PRIV_NET_BRIDGE); if (error) break; } if (ifd->ifd_len != bc->bc_argsize || ifd->ifd_len > sizeof(args)) { error = EINVAL; break; } bzero(&args, sizeof(args)); if (bc->bc_flags & BC_F_COPYIN) { error = copyin(ifd->ifd_data, &args, ifd->ifd_len); if (error) break; } oldmtu = ifp->if_mtu; error = (*bc->bc_func)(sc, &args); if (error) break; /* * Bridge MTU may change during addition of the first port. * If it did, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } if (bc->bc_flags & BC_F_COPYOUT) error = copyout(&args, ifd->ifd_data, ifd->ifd_len); break; case SIOCSIFFLAGS: if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ bridge_stop(ifp, 1); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. 
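			 * bridge_init() acquires BRIDGE_LOCK itself, which is
			 * why the lock is dropped around the if_init call
			 * below.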
			 */
			BRIDGE_UNLOCK(sc);
			(*ifp->if_init)(sc);
			BRIDGE_LOCK(sc);
		}
		break;

	case SIOCSIFMTU:
		oldmtu = sc->sc_ifp->if_mtu;

		if (ifr->ifr_mtu < 576) {
			error = EINVAL;
			break;
		}
		if (CK_LIST_EMPTY(&sc->sc_iflist)) {
			sc->sc_ifp->if_mtu = ifr->ifr_mtu;
			break;
		}
		CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
			error = (*bif->bif_ifp->if_ioctl)(bif->bif_ifp,
			    SIOCSIFMTU, (caddr_t)ifr);
			if (error != 0) {
				log(LOG_NOTICE, "%s: invalid MTU: %u for"
				    " member %s\n", sc->sc_ifp->if_xname,
				    ifr->ifr_mtu, bif->bif_ifp->if_xname);
				error = EINVAL;
				break;
			}
		}
		if (error) {
			/* Restore the previous MTU on all member interfaces. */
			ifr->ifr_mtu = oldmtu;
			CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
				(*bif->bif_ifp->if_ioctl)(bif->bif_ifp,
				    SIOCSIFMTU, (caddr_t)ifr);
			}
		} else {
			sc->sc_ifp->if_mtu = ifr->ifr_mtu;
		}
		break;
	default:
		/*
		 * Drop the lock, as ether_ioctl() will call bridge_start()
		 * and cause the lock to be recursed.
		 */
		BRIDGE_UNLOCK(sc);
		error = ether_ioctl(ifp, cmd, data);
		BRIDGE_LOCK(sc);
		break;
	}
	BRIDGE_UNLOCK(sc);

	return (error);
}

/*
 * bridge_mutecaps:
 *
 *	Clear or restore unwanted capabilities on the member interface.
 */
static void
bridge_mutecaps(struct bridge_softc *sc)
{
	struct bridge_iflist *bif;
	int enabled, mask;

	BRIDGE_LOCK_ASSERT(sc);

	/* Initial bitmask of capabilities to test */
	mask = BRIDGE_IFCAPS_MASK;

	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
		/* Every member must support it or it's disabled */
		mask &= bif->bif_savedcaps;
	}

	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
		enabled = bif->bif_ifp->if_capenable;
		enabled &= ~BRIDGE_IFCAPS_STRIP;
		/* strip off mask bits and enable them again if allowed */
		enabled &= ~BRIDGE_IFCAPS_MASK;
		enabled |= mask;
		bridge_set_ifcap(sc, bif, enabled);
	}
}

static void
bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set)
{
	struct ifnet *ifp = bif->bif_ifp;
	struct ifreq ifr;
	int error, mask, stuck;

	bzero(&ifr, sizeof(ifr));
	ifr.ifr_reqcap = set;

	if (ifp->if_capenable != set) {
		error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr);
		if (error)
			if_printf(sc->sc_ifp,
			    "error setting capabilities on %s: %d\n",
			    ifp->if_xname, error);
		mask = BRIDGE_IFCAPS_MASK | BRIDGE_IFCAPS_STRIP;
		stuck = ifp->if_capenable & mask & ~set;
		if (stuck != 0)
			if_printf(sc->sc_ifp,
			    "can't disable some capabilities on %s: 0x%x\n",
			    ifp->if_xname, stuck);
	}
}

/*
 * bridge_lookup_member:
 *
 *	Lookup a bridge member interface.
 */
static struct bridge_iflist *
bridge_lookup_member(struct bridge_softc *sc, const char *name)
{
	struct bridge_iflist *bif;
	struct ifnet *ifp;

	BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);

	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
		ifp = bif->bif_ifp;
		if (strcmp(ifp->if_xname, name) == 0)
			return (bif);
	}

	return (NULL);
}

/*
 * bridge_lookup_member_if:
 *
 *	Lookup a bridge member interface by ifnet*.
 */
static struct bridge_iflist *
bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp)
{
	struct bridge_iflist *bif;

	BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);

	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
		if (bif->bif_ifp == member_ifp)
			return (bif);
	}

	return (NULL);
}

static void
bridge_delete_member_cb(struct epoch_context *ctx)
{
	struct bridge_iflist *bif;

	bif = __containerof(ctx, struct bridge_iflist, bif_epoch_ctx);

	free(bif, M_DEVBUF);
}

/*
 * bridge_delete_member:
 *
 *	Delete the specified member interface.
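 *
 *	Callers must hold BRIDGE_LOCK.  A hypothetical caller, mirroring
 *	bridge_ioctl_del() ("em0" is only a stand-in member name):
 *
 *		BRIDGE_LOCK(sc);
 *		bif = bridge_lookup_member(sc, "em0");
 *		if (bif != NULL)
 *			bridge_delete_member(sc, bif, 0);
 *		BRIDGE_UNLOCK(sc);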
 */
static void
bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
    int gone)
{
	struct ifnet *ifs = bif->bif_ifp;
	struct ifnet *fif = NULL;
	struct bridge_iflist *bifl;

	BRIDGE_LOCK_ASSERT(sc);

	if (bif->bif_flags & IFBIF_STP)
		bstp_disable(&bif->bif_stp);

	ifs->if_bridge = NULL;
	CK_LIST_REMOVE(bif, bif_next);

	/*
	 * If removing the interface that gave the bridge its mac address, set
	 * the mac address of the bridge to the address of the next member, or
	 * to its default address if no members are left.
	 */
	if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) {
		if (CK_LIST_EMPTY(&sc->sc_iflist)) {
			bcopy(&sc->sc_defaddr,
			    IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
			sc->sc_ifaddr = NULL;
		} else {
			bifl = CK_LIST_FIRST(&sc->sc_iflist);
			fif = bifl->bif_ifp;
			bcopy(IF_LLADDR(fif),
			    IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
			sc->sc_ifaddr = fif;
		}
		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
	}

	bridge_linkcheck(sc);
	bridge_mutecaps(sc);	/* recalculate now that this interface is removed */
	BRIDGE_RT_LOCK(sc);
	bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
	BRIDGE_RT_UNLOCK(sc);
	KASSERT(bif->bif_addrcnt == 0,
	    ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt));

	ifs->if_bridge_output = NULL;
	ifs->if_bridge_input = NULL;
	ifs->if_bridge_linkstate = NULL;
	if (!gone) {
		switch (ifs->if_type) {
		case IFT_ETHER:
		case IFT_L2VLAN:
			/*
			 * Take the interface out of promiscuous mode, but only
			 * if it was promiscuous in the first place. It might
			 * not be if we're in the bridge_ioctl_add() error path.
			 */
			if (ifs->if_flags & IFF_PROMISC)
				(void) ifpromisc(ifs, 0);
			break;

		case IFT_GIF:
			break;

		default:
#ifdef DIAGNOSTIC
			panic("bridge_delete_member: impossible");
#endif
			break;
		}
		/* re-enable any interface capabilities */
		bridge_set_ifcap(sc, bif, bif->bif_savedcaps);
	}
	bstp_destroy(&bif->bif_stp);	/* prepare to free */

	NET_EPOCH_CALL(bridge_delete_member_cb, &bif->bif_epoch_ctx);
}

/*
 * bridge_delete_span:
 *
 *	Delete the specified span interface.
 */
static void
bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif)
{
	BRIDGE_LOCK_ASSERT(sc);

	KASSERT(bif->bif_ifp->if_bridge == NULL,
	    ("%s: not a span interface", __func__));

	CK_LIST_REMOVE(bif, bif_next);
	NET_EPOCH_CALL(bridge_delete_member_cb, &bif->bif_epoch_ctx);
}

static int
bridge_ioctl_add(struct bridge_softc *sc, void *arg)
{
	struct ifbreq *req = arg;
	struct bridge_iflist *bif = NULL;
	struct ifnet *ifs;
	int error = 0;

	ifs = ifunit(req->ifbr_ifsname);
	if (ifs == NULL)
		return (ENOENT);
	if (ifs->if_ioctl == NULL)	/* must be supported */
		return (EINVAL);

	/* If it's in the span list, it can't be a member. */
	CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
		if (ifs == bif->bif_ifp)
			return (EBUSY);

	if (ifs->if_bridge == sc)
		return (EEXIST);

	if (ifs->if_bridge != NULL)
		return (EBUSY);

	switch (ifs->if_type) {
	case IFT_ETHER:
	case IFT_L2VLAN:
	case IFT_GIF:
		/* permitted interface types */
		break;
	default:
		return (EINVAL);
	}

#ifdef INET6
	/*
	 * Two valid inet6 addresses with link-local scope must not be
	 * on the parent interface and the member interfaces at the
	 * same time.  This restriction is needed to prevent violation
	 * of the link-local scope zone.  Attempting to add a member
	 * interface that has inet6 addresses when the parent has
	 * inet6 triggers removal of all inet6 addresses on the member
	 * interface.
	 */

	/* Check if the parent interface has a link-local scope addr. */
	if (V_allow_llz_overlap == 0 &&
	    in6ifa_llaonifp(sc->sc_ifp) != NULL) {
		/*
		 * If any, remove all inet6 addresses from the member
		 * interfaces.
*/ CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (in6ifa_llaonifp(bif->bif_ifp)) { in6_ifdetach(bif->bif_ifp); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", bif->bif_ifp->if_xname); } } if (in6ifa_llaonifp(ifs)) { in6_ifdetach(ifs); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", ifs->if_xname); } } #endif /* Allow the first Ethernet member to define the MTU */ if (CK_LIST_EMPTY(&sc->sc_iflist)) sc->sc_ifp->if_mtu = ifs->if_mtu; else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { struct ifreq ifr; snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", ifs->if_xname); ifr.ifr_mtu = sc->sc_ifp->if_mtu; error = (*ifs->if_ioctl)(ifs, SIOCSIFMTU, (caddr_t)&ifr); if (error != 0) { log(LOG_NOTICE, "%s: invalid MTU: %u for" " new member %s\n", sc->sc_ifp->if_xname, ifr.ifr_mtu, ifs->if_xname); return (EINVAL); } } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); bif->bif_ifp = ifs; bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER; bif->bif_savedcaps = ifs->if_capenable; /* * Assign the interface's MAC address to the bridge if it's the first * member and the MAC address of the bridge has not been changed from * the default randomly generated one. */ if (V_bridge_inherit_mac && CK_LIST_EMPTY(&sc->sc_iflist) && !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr.octet, ETHER_ADDR_LEN)) { bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = ifs; EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } ifs->if_bridge = sc; ifs->if_bridge_output = bridge_output; ifs->if_bridge_input = bridge_input; ifs->if_bridge_linkstate = bridge_linkstate; bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); /* * XXX: XLOCK HERE!?! * * NOTE: insert_***HEAD*** should be safe for the traversals. 
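	 * (CK_LIST_INSERT_HEAD publishes the fully initialized element
	 * before linking it in, so concurrent epoch readers see either the
	 * old head or the complete new entry, never a partially built one.)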
*/ CK_LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next); /* Set interface capabilities to the intersection set of all members */ bridge_mutecaps(sc); bridge_linkcheck(sc); /* Place the interface into promiscuous mode */ switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: error = ifpromisc(ifs, 1); break; } if (error) bridge_delete_member(sc, bif, 0); return (error); } static int bridge_ioctl_del(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bridge_delete_member(sc, bif, 0); return (0); } static int bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct bstp_port *bp; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bp = &bif->bif_stp; req->ifbr_ifsflags = bif->bif_flags; req->ifbr_state = bp->bp_state; req->ifbr_priority = bp->bp_priority; req->ifbr_path_cost = bp->bp_path_cost; req->ifbr_portno = bif->bif_ifp->if_index & 0xfff; req->ifbr_proto = bp->bp_protover; req->ifbr_role = bp->bp_role; req->ifbr_stpflags = bp->bp_flags; req->ifbr_addrcnt = bif->bif_addrcnt; req->ifbr_addrmax = bif->bif_addrmax; req->ifbr_addrexceeded = bif->bif_addrexceeded; /* Copy STP state options as flags */ if (bp->bp_operedge) req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; if (bp->bp_flags & BSTP_PORT_AUTOEDGE) req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE; if (bp->bp_ptp_link) req->ifbr_ifsflags |= IFBIF_BSTP_PTP; if (bp->bp_flags & BSTP_PORT_AUTOPTP) req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP; if (bp->bp_flags & BSTP_PORT_ADMEDGE) req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE; if (bp->bp_flags & BSTP_PORT_ADMCOST) req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; return (0); } static int bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) { struct epoch_tracker et; struct ifbreq *req = arg; struct bridge_iflist *bif; struct bstp_port *bp; int error; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bp = &bif->bif_stp; if (req->ifbr_ifsflags & IFBIF_SPAN) /* SPAN is readonly */ return (EINVAL); NET_EPOCH_ENTER(et); if (req->ifbr_ifsflags & IFBIF_STP) { if ((bif->bif_flags & IFBIF_STP) == 0) { error = bstp_enable(&bif->bif_stp); if (error) { NET_EPOCH_EXIT(et); return (error); } } } else { if ((bif->bif_flags & IFBIF_STP) != 0) bstp_disable(&bif->bif_stp); } /* Pass on STP flags */ bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0); bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0); bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0); bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 
1 : 0); /* Save the bits relating to the bridge */ bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK; NET_EPOCH_EXIT(et); return (0); } static int bridge_ioctl_scache(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; sc->sc_brtmax = param->ifbrp_csize; bridge_rttrim(sc); return (0); } static int bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_csize = sc->sc_brtmax; return (0); } static int bridge_ioctl_gifs(struct bridge_softc *sc, void *arg) { struct ifbifconf *bifc = arg; struct bridge_iflist *bif; struct ifbreq breq; char *buf, *outbuf; int count, buflen, len, error = 0; count = 0; CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) count++; CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) count++; buflen = sizeof(breq) * count; if (bifc->ifbic_len == 0) { bifc->ifbic_len = buflen; return (0); } outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO); if (outbuf == NULL) return (ENOMEM); count = 0; buf = outbuf; len = min(bifc->ifbic_len, buflen); bzero(&breq, sizeof(breq)); CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (len < sizeof(breq)) break; strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, sizeof(breq.ifbr_ifsname)); /* Fill in the ifbreq structure */ error = bridge_ioctl_gifflags(sc, &breq); if (error) break; memcpy(buf, &breq, sizeof(breq)); count++; buf += sizeof(breq); len -= sizeof(breq); } CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { if (len < sizeof(breq)) break; strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, sizeof(breq.ifbr_ifsname)); breq.ifbr_ifsflags = bif->bif_flags; breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; memcpy(buf, &breq, sizeof(breq)); count++; buf += sizeof(breq); len -= sizeof(breq); } bifc->ifbic_len = sizeof(breq) * count; error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_rts(struct bridge_softc *sc, void *arg) { struct ifbaconf *bac = arg; struct bridge_rtnode *brt; struct ifbareq bareq; char *buf, *outbuf; int count, buflen, len, error = 0; if (bac->ifbac_len == 0) return (0); count = 0; CK_LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) count++; buflen = sizeof(bareq) * count; outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO); if (outbuf == NULL) return (ENOMEM); count = 0; buf = outbuf; len = min(bac->ifbac_len, buflen); bzero(&bareq, sizeof(bareq)); CK_LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { if (len < sizeof(bareq)) goto out; strlcpy(bareq.ifba_ifsname, brt->brt_ifp->if_xname, sizeof(bareq.ifba_ifsname)); memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); bareq.ifba_vlan = brt->brt_vlan; if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && time_uptime < brt->brt_expire) bareq.ifba_expire = brt->brt_expire - time_uptime; else bareq.ifba_expire = 0; bareq.ifba_flags = brt->brt_flags; memcpy(buf, &bareq, sizeof(bareq)); count++; buf += sizeof(bareq); len -= sizeof(bareq); } out: bac->ifbac_len = sizeof(bareq) * count; error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_saddr(struct bridge_softc *sc, void *arg) { struct ifbareq *req = arg; struct bridge_iflist *bif; struct epoch_tracker et; int error; NET_EPOCH_ENTER(et); bif = bridge_lookup_member(sc, req->ifba_ifsname); if (bif == NULL) { NET_EPOCH_EXIT(et); return (ENOENT); } /* bridge_rtupdate() may acquire the lock. 
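	 * (specifically BRIDGE_RT_LOCK, when a new route node has to be
	 * inserted).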
*/ error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, req->ifba_flags); NET_EPOCH_EXIT(et); return (error); } static int bridge_ioctl_sto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; sc->sc_brttimeout = param->ifbrp_ctime; return (0); } static int bridge_ioctl_gto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_ctime = sc->sc_brttimeout; return (0); } static int bridge_ioctl_daddr(struct bridge_softc *sc, void *arg) { struct ifbareq *req = arg; return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); } static int bridge_ioctl_flush(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; BRIDGE_RT_LOCK(sc); bridge_rtflush(sc, req->ifbr_ifsflags); BRIDGE_RT_UNLOCK(sc); return (0); } static int bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_prio = bs->bs_bridge_priority; return (0); } static int bridge_ioctl_spri(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio)); } static int bridge_ioctl_ght(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_hellotime = bs->bs_bridge_htime >> 8; return (0); } static int bridge_ioctl_sht(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime)); } static int bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8; return (0); } static int bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay)); } static int bridge_ioctl_gma(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_maxage = bs->bs_bridge_max_age >> 8; return (0); } static int bridge_ioctl_sma(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage)); } static int bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); } static int bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); } static int bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bif->bif_addrmax = req->ifbr_addrmax; return (0); } static int bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif = NULL; struct ifnet *ifs; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) return (EBUSY); if (ifs->if_bridge != NULL) return (EBUSY); switch (ifs->if_type) { case IFT_ETHER: case IFT_GIF: case IFT_L2VLAN: break; default: return (EINVAL); } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) 
return (ENOMEM); bif->bif_ifp = ifs; bif->bif_flags = IFBIF_SPAN; CK_LIST_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next); return (0); } static int bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct ifnet *ifs; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) break; if (bif == NULL) return (ENOENT); bridge_delete_span(sc, bif); return (0); } static int bridge_ioctl_gbparam(struct bridge_softc *sc, void *arg) { struct ifbropreq *req = arg; struct bstp_state *bs = &sc->sc_stp; struct bstp_port *root_port; req->ifbop_maxage = bs->bs_bridge_max_age >> 8; req->ifbop_hellotime = bs->bs_bridge_htime >> 8; req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; root_port = bs->bs_root_port; if (root_port == NULL) req->ifbop_root_port = 0; else req->ifbop_root_port = root_port->bp_ifp->if_index; req->ifbop_holdcount = bs->bs_txholdcount; req->ifbop_priority = bs->bs_bridge_priority; req->ifbop_protocol = bs->bs_protover; req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; return (0); } static int bridge_ioctl_grte(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_cexceeded = sc->sc_brtexceeded; return (0); } static int bridge_ioctl_gifsstp(struct bridge_softc *sc, void *arg) { struct ifbpstpconf *bifstp = arg; struct bridge_iflist *bif; struct bstp_port *bp; struct ifbpstpreq bpreq; char *buf, *outbuf; int count, buflen, len, error = 0; count = 0; CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if ((bif->bif_flags & IFBIF_STP) != 0) count++; } buflen = sizeof(bpreq) * count; if (bifstp->ifbpstp_len == 0) { bifstp->ifbpstp_len = buflen; return (0); } outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO); if (outbuf == NULL) return (ENOMEM); count = 0; buf = outbuf; len = min(bifstp->ifbpstp_len, buflen); bzero(&bpreq, sizeof(bpreq)); CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (len < sizeof(bpreq)) break; if ((bif->bif_flags & IFBIF_STP) == 0) continue; bp = &bif->bif_stp; bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id; bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id; bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; memcpy(buf, &bpreq, sizeof(bpreq)); count++; buf += sizeof(bpreq); len -= sizeof(bpreq); } bifstp->ifbpstp_len = sizeof(bpreq) * count; error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto)); } static int bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc)); } /* * bridge_ifdetach: * * Detach an interface from a bridge. Called when a member * interface is detaching. 
 */
static void
bridge_ifdetach(void *arg __unused, struct ifnet *ifp)
{
	struct bridge_softc *sc = ifp->if_bridge;
	struct bridge_iflist *bif;

	if (ifp->if_flags & IFF_RENAMING)
		return;
	if (V_bridge_cloner == NULL) {
		/*
		 * This detach handler can be called after
		 * vnet_bridge_uninit().  Just return in that case.
		 */
		return;
	}
	/* Check if the interface is a bridge member */
	if (sc != NULL) {
		BRIDGE_LOCK(sc);
		bif = bridge_lookup_member_if(sc, ifp);
		if (bif != NULL)
			bridge_delete_member(sc, bif, 1);
		BRIDGE_UNLOCK(sc);
		return;
	}

	/* Check if the interface is a span port */
	BRIDGE_LIST_LOCK();
	LIST_FOREACH(sc, &V_bridge_list, sc_list) {
		BRIDGE_LOCK(sc);
		CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
			if (ifp == bif->bif_ifp) {
				bridge_delete_span(sc, bif);
				break;
			}
		BRIDGE_UNLOCK(sc);
	}
	BRIDGE_LIST_UNLOCK();
}

/*
 * bridge_init:
 *
 *	Initialize a bridge interface.
 */
static void
bridge_init(void *xsc)
{
	struct bridge_softc *sc = (struct bridge_softc *)xsc;
	struct ifnet *ifp = sc->sc_ifp;

	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
		return;

	BRIDGE_LOCK(sc);
	callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz,
	    bridge_timer, sc);

	ifp->if_drv_flags |= IFF_DRV_RUNNING;
	bstp_init(&sc->sc_stp);		/* Initialize Spanning Tree */

	BRIDGE_UNLOCK(sc);
}

/*
 * bridge_stop:
 *
 *	Stop the bridge interface.
 */
static void
bridge_stop(struct ifnet *ifp, int disable)
{
	struct bridge_softc *sc = ifp->if_softc;

	BRIDGE_LOCK_ASSERT(sc);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		return;

	BRIDGE_RT_LOCK(sc);
	callout_stop(&sc->sc_brcallout);

	bstp_stop(&sc->sc_stp);

	bridge_rtflush(sc, IFBF_FLUSHDYN);
	BRIDGE_RT_UNLOCK(sc);

	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
}

/*
 * bridge_enqueue:
 *
 *	Enqueue a packet on a bridge member interface.
 *
 */
static int
bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
{
	int len, err = 0;
	short mflags;
	struct mbuf *m0;

	/* We may be sending a fragment so traverse the mbuf */
	for (; m; m = m0) {
		m0 = m->m_nextpkt;
		m->m_nextpkt = NULL;
		len = m->m_pkthdr.len;
		mflags = m->m_flags;

		/*
		 * If the underlying interface cannot do VLAN tag insertion
		 * itself then attach a packet tag that holds it.
		 */
		if ((m->m_flags & M_VLANTAG) &&
		    (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) {
			m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
			if (m == NULL) {
				if_printf(dst_ifp,
				    "unable to prepend VLAN header\n");
				if_inc_counter(dst_ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
			m->m_flags &= ~M_VLANTAG;
		}

		M_ASSERTPKTHDR(m); /* We shouldn't transmit an mbuf without a pkthdr */
		if ((err = dst_ifp->if_transmit(dst_ifp, m))) {
			int n;

			for (m = m0, n = 1; m != NULL; m = m0, n++) {
				m0 = m->m_nextpkt;
				m_freem(m);
			}
			if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, n);
			break;
		}

		if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len);
		if (mflags & M_MCAST)
			if_inc_counter(sc->sc_ifp, IFCOUNTER_OMCASTS, 1);
	}

	return (err);
}

/*
 * bridge_dummynet:
 *
 *	Receive a queued packet from dummynet and pass it on to the output
 *	interface.
 *
 *	The mbuf has the Ethernet header already attached.
 */
static void
bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
{
	struct bridge_softc *sc;

	sc = ifp->if_bridge;

	/*
	 * The packet didn't originate from a member interface. This should
	 * only ever happen if a member interface is removed while packets are
	 * queued for it.
	 */
	if (sc == NULL) {
		m_freem(m);
		return;
	}

	if (PFIL_HOOKED_OUT(V_inet_pfil_head)
#ifdef INET6
	    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
#endif
	    ) {
		if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0)
			return;
		if (m == NULL)
			return;
	}

	bridge_enqueue(sc, ifp, m);
}

/*
 * bridge_output:
 *
 *	Send output from a bridge member interface.  This
 *	performs the bridging function for locally originated
 *	packets.
 *
 *	The mbuf has the Ethernet header already attached.  We must
 *	enqueue or free the mbuf before returning.
 */
static int
bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
    struct rtentry *rt)
{
	struct ether_header *eh;
	struct ifnet *bifp, *dst_if;
	struct bridge_softc *sc;
	uint16_t vlan;

	NET_EPOCH_ASSERT();

	if (m->m_len < ETHER_HDR_LEN) {
		m = m_pullup(m, ETHER_HDR_LEN);
		if (m == NULL)
			return (0);
	}

	eh = mtod(m, struct ether_header *);
	sc = ifp->if_bridge;
	vlan = VLANTAGOF(m);

	bifp = sc->sc_ifp;

	/*
	 * If bridge is down, but the original output interface is up,
	 * go ahead and send out that interface.  Otherwise, the packet
	 * is dropped below.
	 */
	if ((bifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
		dst_if = ifp;
		goto sendunicast;
	}

	/*
	 * If the packet is multicast, or we don't know a better way to
	 * get there, send to all interfaces.
	 */
	if (ETHER_IS_MULTICAST(eh->ether_dhost))
		dst_if = NULL;
	else
		dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan);
	/* Tap any traffic not passing back out the originating interface */
	if (dst_if != ifp)
		ETHER_BPF_MTAP(bifp, m);
	if (dst_if == NULL) {
		struct bridge_iflist *bif;
		struct mbuf *mc;
		int used = 0;

		bridge_span(sc, m);

		CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
			dst_if = bif->bif_ifp;

			if (dst_if->if_type == IFT_GIF)
				continue;
			if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
				continue;

			/*
			 * If this is not the original output interface,
			 * and the interface is participating in spanning
			 * tree, make sure the port is in a state that
			 * allows forwarding.
			 */
			if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) &&
			    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
				continue;

			if (CK_LIST_NEXT(bif, bif_next) == NULL) {
				used = 1;
				mc = m;
			} else {
				mc = m_dup(m, M_NOWAIT);
				if (mc == NULL) {
					if_inc_counter(bifp, IFCOUNTER_OERRORS, 1);
					continue;
				}
			}

			bridge_enqueue(sc, dst_if, mc);
		}
		if (used == 0)
			m_freem(m);
		return (0);
	}

sendunicast:
	/*
	 * XXX Spanning tree consideration here?
	 */

	bridge_span(sc, m);
	if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) {
		m_freem(m);
		return (0);
	}

	bridge_enqueue(sc, dst_if, m);
	return (0);
}

/*
 * bridge_transmit:
 *
 *	Do output on a bridge.
 *
 */
static int
bridge_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct bridge_softc *sc;
	struct ether_header *eh;
	struct ifnet *dst_if;
	int error = 0;

	sc = ifp->if_softc;

	ETHER_BPF_MTAP(ifp, m);

	eh = mtod(m, struct ether_header *);

	if (((m->m_flags & (M_BCAST|M_MCAST)) == 0) &&
	    (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1)) != NULL) {
		error = bridge_enqueue(sc, dst_if, m);
	} else
		bridge_broadcast(sc, ifp, m, 0);

	return (error);
}

#ifdef ALTQ
static void
bridge_altq_start(if_t ifp)
{
	struct ifaltq *ifq = &ifp->if_snd;
	struct mbuf *m;

	IFQ_LOCK(ifq);
	IFQ_DEQUEUE_NOLOCK(ifq, m);
	while (m != NULL) {
		bridge_transmit(ifp, m);
		IFQ_DEQUEUE_NOLOCK(ifq, m);
	}
	IFQ_UNLOCK(ifq);
}

static int
bridge_altq_transmit(if_t ifp, struct mbuf *m)
{
	int err;

	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
		IFQ_ENQUEUE(&ifp->if_snd, m, err);
		if (err == 0)
			bridge_altq_start(ifp);
	} else
		err = bridge_transmit(ifp, m);

	return (err);
}
#endif	/* ALTQ */

/*
 * The ifp->if_qflush entry point for if_bridge(4) is a no-op.
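 * (Aside from the optional ALTQ queue, the bridge keeps no software
 * transmit queue of its own, so there is nothing to flush.)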
 */
static void
bridge_qflush(struct ifnet *ifp __unused)
{
}

/*
 * bridge_forward:
 *
 *	The forwarding function of the bridge.
 *
 *	NOTE: Releases the lock on return.
 */
static void
bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
    struct mbuf *m)
{
	struct bridge_iflist *dbif;
	struct ifnet *src_if, *dst_if, *ifp;
	struct ether_header *eh;
	uint16_t vlan;
	uint8_t *dst;
	int error;

	NET_EPOCH_ASSERT();

	src_if = m->m_pkthdr.rcvif;
	ifp = sc->sc_ifp;

	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
	vlan = VLANTAGOF(m);

	if ((sbif->bif_flags & IFBIF_STP) &&
	    sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
		goto drop;

	eh = mtod(m, struct ether_header *);
	dst = eh->ether_dhost;

	/* If the interface is learning, record the address. */
	if (sbif->bif_flags & IFBIF_LEARNING) {
		error = bridge_rtupdate(sc, eh->ether_shost, vlan,
		    sbif, 0, IFBAF_DYNAMIC);
		/*
		 * If the interface has address limits then deny any source
		 * that is not in the cache.
		 */
		if (error && sbif->bif_addrmax)
			goto drop;
	}

	if ((sbif->bif_flags & IFBIF_STP) != 0 &&
	    sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING)
		goto drop;

	/*
	 * At this point, the port either doesn't participate
	 * in spanning tree or it is in the forwarding state.
	 */

	/*
	 * If the packet is unicast, destined for someone on
	 * "this" side of the bridge, drop it.
	 */
	if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
		dst_if = bridge_rtlookup(sc, dst, vlan);
		if (src_if == dst_if)
			goto drop;
	} else {
		/*
		 * Check whether it's a reserved multicast address; addresses
		 * listed in 802.1D section 7.12.6 may not be forwarded by the
		 * bridge.
		 * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F
		 */
		if (dst[0] == 0x01 && dst[1] == 0x80 &&
		    dst[2] == 0xc2 && dst[3] == 0x00 &&
		    dst[4] == 0x00 && dst[5] <= 0x0f)
			goto drop;

		/* ...forward it to all interfaces. */
		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
		dst_if = NULL;
	}

	/*
	 * If we have a destination interface which is a member of our bridge,
	 * OR this is a unicast packet, push it through the bpf(4) machinery.
	 * For broadcast or multicast packets, don't bother because it will
	 * be reinjected into ether_input. We do this before we pass the packets
	 * through the pfil(9) framework, as it is possible that pfil(9) will
	 * drop the packet, or possibly modify it, making it difficult to debug
	 * firewall issues on the bridge.
	 */
	if (dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0)
		ETHER_BPF_MTAP(ifp, m);

	/* run the packet filter */
	if (PFIL_HOOKED_IN(V_inet_pfil_head)
#ifdef INET6
	    || PFIL_HOOKED_IN(V_inet6_pfil_head)
#endif
	    ) {
		if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0)
			return;
		if (m == NULL)
			return;
	}

	if (dst_if == NULL) {
		bridge_broadcast(sc, src_if, m, 1);
		return;
	}

	/*
	 * At this point, we're dealing with a unicast frame
	 * going to a different interface.
	 */
	if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
		goto drop;

	dbif = bridge_lookup_member_if(sc, dst_if);
	if (dbif == NULL)
		/* Not a member of the bridge (anymore?) */
		goto drop;

	/* Private segments cannot talk to each other */
	if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)
		goto drop;

	if ((dbif->bif_flags & IFBIF_STP) &&
	    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
		goto drop;

	if (PFIL_HOOKED_OUT(V_inet_pfil_head)
#ifdef INET6
	    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
#endif
	    ) {
		if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0)
			return;
		if (m == NULL)
			return;
	}

	bridge_enqueue(sc, dst_if, m);
	return;

drop:
	m_freem(m);
}

/*
 * bridge_input:
 *
 *	Receive input from a member interface.
Queue the packet for * bridging if it is not for us. */ static struct mbuf * bridge_input(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif, *bif2; struct ifnet *bifp; struct ether_header *eh; struct mbuf *mc, *mc2; uint16_t vlan; int error; NET_EPOCH_ASSERT(); if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return (m); bifp = sc->sc_ifp; vlan = VLANTAGOF(m); /* * Implement support for bridge monitoring. If this flag has been * set on this interface, discard the packet once we push it through * the bpf(4) machinery, but before we do, increment the byte and * packet counters associated with this interface. */ if ((bifp->if_flags & IFF_MONITOR) != 0) { m->m_pkthdr.rcvif = bifp; ETHER_BPF_MTAP(bifp, m); if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return (NULL); } bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { return (m); } eh = mtod(m, struct ether_header *); bridge_span(sc, m); if (m->m_flags & (M_BCAST|M_MCAST)) { /* Tap off 802.1D packets; they do not get forwarded. */ if (memcmp(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN) == 0) { bstp_input(&bif->bif_stp, ifp, m); /* consumes mbuf */ return (NULL); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { return (m); } /* * Make a deep copy of the packet and enqueue the copy * for bridge processing; return the original packet for * local processing. */ mc = m_dup(m, M_NOWAIT); if (mc == NULL) { return (m); } /* Perform the bridge forwarding function with the copy. */ bridge_forward(sc, bif, mc); /* * Reinject the mbuf as arriving on the bridge so we have a * chance at claiming multicast packets. We can not loop back * here from ether_input as a bridge is never a member of a * bridge. */ KASSERT(bifp->if_bridge == NULL, ("loop created in bridge_input")); mc2 = m_dup(m, M_NOWAIT); if (mc2 != NULL) { /* Keep the layer3 header aligned */ int i = min(mc2->m_pkthdr.len, max_protohdr); mc2 = m_copyup(mc2, i, ETHER_ALIGN); } if (mc2 != NULL) { mc2->m_pkthdr.rcvif = bifp; (*bifp->if_input)(bifp, mc2); } /* Return the original packet for local processing. */ return (m); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { return (m); } #if (defined(INET) || defined(INET6)) # define OR_CARP_CHECK_WE_ARE_DST(iface) \ || ((iface)->if_carp \ && (*carp_forus_p)((iface), eh->ether_dhost)) # define OR_CARP_CHECK_WE_ARE_SRC(iface) \ || ((iface)->if_carp \ && (*carp_forus_p)((iface), eh->ether_shost)) #else # define OR_CARP_CHECK_WE_ARE_DST(iface) # define OR_CARP_CHECK_WE_ARE_SRC(iface) #endif #ifdef INET6 # define OR_PFIL_HOOKED_INET6 \ || PFIL_HOOKED_IN(V_inet6_pfil_head) #else # define OR_PFIL_HOOKED_INET6 #endif #define GRAB_OUR_PACKETS(iface) \ if ((iface)->if_type == IFT_GIF) \ continue; \ /* It is destined for us. */ \ if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_DST((iface)) \ ) { \ if (bif->bif_flags & IFBIF_LEARNING) { \ error = bridge_rtupdate(sc, eh->ether_shost, \ vlan, bif, 0, IFBAF_DYNAMIC); \ if (error && bif->bif_addrmax) { \ m_freem(m); \ return (NULL); \ } \ } \ m->m_pkthdr.rcvif = iface; \ if ((iface) == ifp) { \ /* Skip bridge processing... src == dest */ \ return (m); \ } \ /* It's passing over or to the bridge, locally. 
 */								\
		ETHER_BPF_MTAP(bifp, m);			\
		if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1);	\
		if_inc_counter(bifp, IFCOUNTER_IBYTES,		\
		    m->m_pkthdr.len);				\
		/* Filter on the physical interface. */		\
		if (V_pfil_local_phys &&			\
		    (PFIL_HOOKED_IN(V_inet_pfil_head)		\
		     OR_PFIL_HOOKED_INET6)) {			\
			if (bridge_pfil(&m, NULL, ifp,		\
			    PFIL_IN) != 0 || m == NULL) {	\
				return (NULL);			\
			}					\
		}						\
		if ((iface) != bifp)				\
			ETHER_BPF_MTAP(iface, m);		\
		return (m);					\
	}							\
								\
	/* We just received a packet that we sent out. */	\
	if (memcmp(IF_LLADDR((iface)), eh->ether_shost,		\
	    ETHER_ADDR_LEN) == 0				\
	    OR_CARP_CHECK_WE_ARE_SRC((iface))			\
	    ) {							\
		m_freem(m);					\
		return (NULL);					\
	}

	/*
	 * Unicast.  Make sure it's not for the bridge.
	 */
	do { GRAB_OUR_PACKETS(bifp) } while (0);

	/*
	 * Give the interface ifp a chance first.  This helps when the
	 * packet comes through an interface such as a vlan(4) that shares
	 * a MAC with several interfaces on the same bridge.  It also saves
	 * some CPU cycles when the destination interface and the input
	 * interface (i.e. ifp) are the same.
	 */
	do { GRAB_OUR_PACKETS(ifp) } while (0);

	/* Now check all the bridge members. */
	CK_LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) {
		GRAB_OUR_PACKETS(bif2->bif_ifp)
	}

#undef OR_CARP_CHECK_WE_ARE_DST
#undef OR_CARP_CHECK_WE_ARE_SRC
#undef OR_PFIL_HOOKED_INET6
#undef GRAB_OUR_PACKETS

	/* Perform the bridge forwarding function. */
	bridge_forward(sc, bif, m);

	return (NULL);
}

/*
 * bridge_broadcast:
 *
 *	Send a frame to all interfaces that are members of
 *	the bridge, except for the one on which the packet
 *	arrived.
 *
 *	NOTE: Releases the lock on return.
 */
static void
bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
    struct mbuf *m, int runfilt)
{
	struct bridge_iflist *dbif, *sbif;
	struct mbuf *mc;
	struct ifnet *dst_if;
	int used = 0, i;

	NET_EPOCH_ASSERT();

	sbif = bridge_lookup_member_if(sc, src_if);

	/* Filter on the bridge interface before broadcasting */
	if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head)
#ifdef INET6
	    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
#endif
	    )) {
		if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0)
			return;
		if (m == NULL)
			return;
	}

	CK_LIST_FOREACH(dbif, &sc->sc_iflist, bif_next) {
		dst_if = dbif->bif_ifp;
		if (dst_if == src_if)
			continue;

		/* Private segments cannot talk to each other */
		if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE))
			continue;

		if ((dbif->bif_flags & IFBIF_STP) &&
		    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
			continue;

		if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 &&
		    (m->m_flags & (M_BCAST|M_MCAST)) == 0)
			continue;

		if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
			continue;

		if (CK_LIST_NEXT(dbif, bif_next) == NULL) {
			mc = m;
			used = 1;
		} else {
			mc = m_dup(m, M_NOWAIT);
			if (mc == NULL) {
				if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		}

		/*
		 * Filter on the output interface. Pass a NULL bridge interface
		 * pointer so we do not redundantly filter on the bridge for
		 * each interface we broadcast on.
		 */
		if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head)
#ifdef INET6
		    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
#endif
		    )) {
			if (used == 0) {
				/* Keep the layer3 header aligned */
				i = min(mc->m_pkthdr.len, max_protohdr);
				mc = m_copyup(mc, i, ETHER_ALIGN);
				if (mc == NULL) {
					if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
					continue;
				}
			}
			if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0)
				continue;
			if (mc == NULL)
				continue;
		}

		bridge_enqueue(sc, dst_if, mc);
	}
	if (used == 0)
		m_freem(m);
}

/*
 * bridge_span:
 *
 *	Duplicate a packet out one or more interfaces that are in span mode;
 *	the original mbuf is unmodified.
 */
static void
bridge_span(struct bridge_softc *sc, struct mbuf *m)
{
	struct bridge_iflist *bif;
	struct ifnet *dst_if;
	struct mbuf *mc;

	NET_EPOCH_ASSERT();

	if (CK_LIST_EMPTY(&sc->sc_spanlist))
		return;

	CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) {
		dst_if = bif->bif_ifp;

		if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
			continue;

		mc = m_dup(m, M_NOWAIT);
		if (mc == NULL) {
			if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
			continue;
		}

		bridge_enqueue(sc, dst_if, mc);
	}
}

/*
 * bridge_rtupdate:
 *
 *	Add a bridge routing entry.
 */
static int
bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan,
    struct bridge_iflist *bif, int setflags, uint8_t flags)
{
	struct bridge_rtnode *brt;
	int error;

	BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);

	/* Check that the source address is valid and not multicast. */
	if (ETHER_IS_MULTICAST(dst) ||
	    (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 &&
	     dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0)
		return (EINVAL);

	/* 802.1p frames map to vlan 1 */
	if (vlan == 0)
		vlan = 1;

	/*
	 * A route for this destination might already exist.  If so,
	 * update it, otherwise create a new one.
	 */
	if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) {
		BRIDGE_RT_LOCK(sc);

		/* Check again, now that we have the lock. There could have
		 * been a race and we only want to insert this once. */
		if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) != NULL) {
			BRIDGE_RT_UNLOCK(sc);
			return (0);
		}

		if (sc->sc_brtcnt >= sc->sc_brtmax) {
			sc->sc_brtexceeded++;
			BRIDGE_RT_UNLOCK(sc);
			return (ENOSPC);
		}
		/* Check per interface address limits (if enabled) */
		if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) {
			bif->bif_addrexceeded++;
			BRIDGE_RT_UNLOCK(sc);
			return (ENOSPC);
		}

		/*
		 * Allocate a new bridge forwarding node, and
		 * initialize the expiration time and Ethernet
		 * address.
		 */
		brt = uma_zalloc(V_bridge_rtnode_zone, M_NOWAIT | M_ZERO);
		if (brt == NULL) {
			BRIDGE_RT_UNLOCK(sc);
			return (ENOMEM);
		}
		brt->brt_vnet = curvnet;

		if (bif->bif_flags & IFBIF_STICKY)
			brt->brt_flags = IFBAF_STICKY;
		else
			brt->brt_flags = IFBAF_DYNAMIC;

		memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN);
		brt->brt_vlan = vlan;

		if ((error = bridge_rtnode_insert(sc, brt)) != 0) {
			uma_zfree(V_bridge_rtnode_zone, brt);
			BRIDGE_RT_UNLOCK(sc);
			return (error);
		}
		brt->brt_dst = bif;
		bif->bif_addrcnt++;

		BRIDGE_RT_UNLOCK(sc);
	}

	if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
	    brt->brt_dst != bif) {
		BRIDGE_RT_LOCK(sc);
		brt->brt_dst->bif_addrcnt--;
		brt->brt_dst = bif;
		brt->brt_dst->bif_addrcnt++;
		BRIDGE_RT_UNLOCK(sc);
	}

	if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
		brt->brt_expire = time_uptime + sc->sc_brttimeout;
	if (setflags)
		brt->brt_flags = flags;

	return (0);
}

/*
 * bridge_rtlookup:
 *
 *	Lookup the destination interface for an address.
 */
static struct ifnet *
bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
{
	struct bridge_rtnode *brt;

	NET_EPOCH_ASSERT();

	if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL)
		return (NULL);

	return (brt->brt_ifp);
}

/*
 * bridge_rttrim:
 *
 *	Trim the routing table so that we have a number
 *	of routing entries less than or equal to the
 *	maximum number.
 */
static void
bridge_rttrim(struct bridge_softc *sc)
{
	struct bridge_rtnode *brt, *nbrt;

	NET_EPOCH_ASSERT();
	BRIDGE_RT_LOCK_ASSERT(sc);

	/* Make sure we actually need to do this. */
	if (sc->sc_brtcnt <= sc->sc_brtmax)
		return;

	/* Force an aging cycle; this might trim enough addresses.
*/ bridge_rtage(sc); if (sc->sc_brtcnt <= sc->sc_brtmax) return; CK_LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { bridge_rtnode_destroy(sc, brt); if (sc->sc_brtcnt <= sc->sc_brtmax) return; } } } /* * bridge_timer: * * Aging timer for the bridge. */ static void bridge_timer(void *arg) { struct bridge_softc *sc = arg; BRIDGE_RT_LOCK_ASSERT(sc); /* Destruction of rtnodes requires a proper vnet context */ CURVNET_SET(sc->sc_ifp->if_vnet); bridge_rtage(sc); if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); CURVNET_RESTORE(); } /* * bridge_rtage: * * Perform an aging cycle. */ static void bridge_rtage(struct bridge_softc *sc) { struct bridge_rtnode *brt, *nbrt; BRIDGE_RT_LOCK_ASSERT(sc); CK_LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { if (time_uptime >= brt->brt_expire) bridge_rtnode_destroy(sc, brt); } } } /* * bridge_rtflush: * * Remove all dynamic addresses from the bridge. */ static void bridge_rtflush(struct bridge_softc *sc, int full) { struct bridge_rtnode *brt, *nbrt; BRIDGE_RT_LOCK_ASSERT(sc); CK_LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) bridge_rtnode_destroy(sc, brt); } } /* * bridge_rtdaddr: * * Remove an address from the table. */ static int bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; int found = 0; BRIDGE_RT_LOCK(sc); /* * If vlan is zero then we want to delete for all vlans so the lookup * may return more than one. */ while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) { bridge_rtnode_destroy(sc, brt); found = 1; } BRIDGE_RT_UNLOCK(sc); return (found ? 0 : ENOENT); } /* * bridge_rtdelete: * * Delete routes to a specific member interface. */ static void bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full) { struct bridge_rtnode *brt, *nbrt; BRIDGE_RT_LOCK_ASSERT(sc); CK_LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (brt->brt_ifp == ifp && (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)) bridge_rtnode_destroy(sc, brt); } } /* * bridge_rtable_init: * * Initialize the route table for this bridge. */ static void bridge_rtable_init(struct bridge_softc *sc) { int i; sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, M_DEVBUF, M_WAITOK); for (i = 0; i < BRIDGE_RTHASH_SIZE; i++) CK_LIST_INIT(&sc->sc_rthash[i]); sc->sc_rthash_key = arc4random(); CK_LIST_INIT(&sc->sc_rtlist); } /* * bridge_rtable_fini: * * Deconstruct the route table for this bridge. */ static void bridge_rtable_fini(struct bridge_softc *sc) { KASSERT(sc->sc_brtcnt == 0, ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt)); free(sc->sc_rthash, M_DEVBUF); } /* * The following hash function is adapted from "Hash Functions" by Bob Jenkins * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
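* * The bridge uses it to pick a forwarding-table bucket from the 48-bit * MAC address, seeded with the per-bridge random sc_rthash_key, e.g.: * * hash = bridge_rthash(sc, addr); * CK_LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) * ...compare brt->brt_addr and brt->brt_vlan...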
*/ #define mix(a, b, c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (/*CONSTCOND*/0) static __inline uint32_t bridge_rthash(struct bridge_softc *sc, const uint8_t *addr) { uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key; b += addr[5] << 8; b += addr[4]; a += addr[3] << 24; a += addr[2] << 16; a += addr[1] << 8; a += addr[0]; mix(a, b, c); return (c & BRIDGE_RTHASH_MASK); } #undef mix static int bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b) { int i, d; for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { d = ((int)a[i]) - ((int)b[i]); } return (d); } /* * bridge_rtnode_lookup: * * Look up a bridge route node for the specified destination. Compare the * vlan id or if zero then just return the first match. */ static struct bridge_rtnode * bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; uint32_t hash; int dir; BRIDGE_RT_LOCK_OR_NET_EPOCH_ASSERT(sc); hash = bridge_rthash(sc, addr); CK_LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) { dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr); if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0)) return (brt); if (dir > 0) return (NULL); } return (NULL); } /* * bridge_rtnode_insert: * * Insert the specified bridge node into the route table. We * assume the entry is not already in the table. */ static int bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt) { struct bridge_rtnode *lbrt; uint32_t hash; int dir; BRIDGE_RT_LOCK_ASSERT(sc); hash = bridge_rthash(sc, brt->brt_addr); lbrt = CK_LIST_FIRST(&sc->sc_rthash[hash]); if (lbrt == NULL) { CK_LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash); goto out; } do { dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr); if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan) return (EEXIST); if (dir > 0) { CK_LIST_INSERT_BEFORE(lbrt, brt, brt_hash); goto out; } if (CK_LIST_NEXT(lbrt, brt_hash) == NULL) { CK_LIST_INSERT_AFTER(lbrt, brt, brt_hash); goto out; } lbrt = CK_LIST_NEXT(lbrt, brt_hash); } while (lbrt != NULL); #ifdef DIAGNOSTIC panic("bridge_rtnode_insert: impossible"); #endif out: CK_LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list); sc->sc_brtcnt++; return (0); } static void bridge_rtnode_destroy_cb(struct epoch_context *ctx) { struct bridge_rtnode *brt; brt = __containerof(ctx, struct bridge_rtnode, brt_epoch_ctx); CURVNET_SET(brt->brt_vnet); uma_zfree(V_bridge_rtnode_zone, brt); CURVNET_RESTORE(); } /* * bridge_rtnode_destroy: * * Destroy a bridge rtnode. */ static void bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) { BRIDGE_RT_LOCK_ASSERT(sc); CK_LIST_REMOVE(brt, brt_hash); CK_LIST_REMOVE(brt, brt_list); sc->sc_brtcnt--; brt->brt_dst->bif_addrcnt--; NET_EPOCH_CALL(bridge_rtnode_destroy_cb, &brt->brt_epoch_ctx); } /* * bridge_rtable_expire: * * Set the expiry time for all routes on an interface. 
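* * An age of zero flushes the interface's dynamic entries outright; a * non-zero age only lowers expiry times that lie further out, e.g.: * * bridge_rtable_expire(ifp, 0); flush dynamic entries now * bridge_rtable_expire(ifp, 30); expire them within 30 seconds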
*/ static void bridge_rtable_expire(struct ifnet *ifp, int age) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_rtnode *brt; CURVNET_SET(ifp->if_vnet); BRIDGE_RT_LOCK(sc); /* * If the age is zero then flush, otherwise set all the expiry times to * age for the interface */ if (age == 0) bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN); else { CK_LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { /* Cap the expiry time to 'age' */ if (brt->brt_ifp == ifp && brt->brt_expire > time_uptime + age && (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) brt->brt_expire = time_uptime + age; } } BRIDGE_RT_UNLOCK(sc); CURVNET_RESTORE(); } /* * bridge_state_change: * * Callback from the bridgestp code when a port changes states. */ static void bridge_state_change(struct ifnet *ifp, int state) { struct bridge_softc *sc = ifp->if_bridge; static const char *stpstates[] = { "disabled", "listening", "learning", "forwarding", "blocking", "discarding" }; CURVNET_SET(ifp->if_vnet); if (V_log_stp) log(LOG_NOTICE, "%s: state changed to %s on %s\n", sc->sc_ifp->if_xname, stpstates[state], ifp->if_xname); CURVNET_RESTORE(); } /* * Send bridge packets through pfil if they are one of the types pfil can deal * with, or if they are ARP or REVARP. (pfil will pass ARP and REVARP without * question.) If bifp or ifp is NULL, packet filtering is skipped for * that interface. */ static int bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) { int snap, error, i, hlen; struct ether_header *eh1, eh2; struct ip *ip; struct llc llc1; u_int16_t ether_type; pfil_return_t rv; snap = 0; error = -1; /* Default error unless explicitly set to 0 */ #if 0 /* we may return with the IP fields swapped, ensure it's not shared */ KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__)); #endif if (V_pfil_bridge == 0 && V_pfil_member == 0 && V_pfil_ipfw == 0) return (0); /* filtering is disabled */ i = min((*mp)->m_pkthdr.len, max_protohdr); if ((*mp)->m_len < i) { *mp = m_pullup(*mp, i); if (*mp == NULL) { printf("%s: m_pullup failed\n", __func__); return (-1); } } eh1 = mtod(*mp, struct ether_header *); ether_type = ntohs(eh1->ether_type); /* * Check for SNAP/LLC. */ if (ether_type < ETHERMTU) { struct llc *llc2 = (struct llc *)(eh1 + 1); if ((*mp)->m_len >= ETHER_HDR_LEN + 8 && llc2->llc_dsap == LLC_SNAP_LSAP && llc2->llc_ssap == LLC_SNAP_LSAP && llc2->llc_control == LLC_UI) { ether_type = htons(llc2->llc_un.type_snap.ether_type); snap = 1; } } /* * If we're trying to filter bridge traffic, don't look at anything * other than IP and ARP traffic. If the filter doesn't understand * IPv6, don't allow IPv6 through the bridge either. This is lame * since if we really wanted, say, an AppleTalk filter, we are hosed, * but of course we don't have an AppleTalk filter to begin with. * (Note that since pfil doesn't understand ARP it will pass *ALL* * ARP traffic.) */ switch (ether_type) { case ETHERTYPE_ARP: case ETHERTYPE_REVARP: if (V_pfil_ipfw_arp == 0) return (0); /* Automatically pass */ break; case ETHERTYPE_IP: #ifdef INET6 case ETHERTYPE_IPV6: #endif /* INET6 */ break; default: /* * Check to see if the user wants to pass non-IP * packets; these will not be checked by pfil(9) and * would be passed unconditionally, so the default is to drop.
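* * Note on the conversion in this function: the old * pfil_run_hooks(head, mp, ifp, dir, NULL) calls are replaced by the * direction-specific pfil_mbuf_in(head, mp, ifp, NULL) and * pfil_mbuf_out(head, mp, ifp, NULL). Where the direction is only * known at run time, the call sites below dispatch on 'dir', e.g.: * * rv = (dir == PFIL_OUT) ? * pfil_mbuf_out(V_inet_pfil_head, mp, ifp, NULL) : * pfil_mbuf_in(V_inet_pfil_head, mp, ifp, NULL);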
*/ if (V_pfil_onlyip) goto bad; } /* Run the packet through pfil before stripping link headers */ if (PFIL_HOOKED_OUT(V_link_pfil_head) && V_pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) { - switch (pfil_run_hooks(V_link_pfil_head, mp, ifp, dir, NULL)) { + switch (pfil_mbuf_out(V_link_pfil_head, mp, ifp, NULL)) { case PFIL_DROPPED: return (EACCES); case PFIL_CONSUMED: return (0); } } /* Strip off the Ethernet header and keep a copy. */ m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2); m_adj(*mp, ETHER_HDR_LEN); /* Strip off snap header, if present */ if (snap) { m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1); m_adj(*mp, sizeof(struct llc)); } /* * Check the IP header for alignment and errors */ if (dir == PFIL_IN) { switch (ether_type) { case ETHERTYPE_IP: error = bridge_ip_checkbasic(mp); break; #ifdef INET6 case ETHERTYPE_IPV6: error = bridge_ip6_checkbasic(mp); break; #endif /* INET6 */ default: error = 0; } if (error) goto bad; } error = 0; /* * Run the packet through pfil */ rv = PFIL_PASS; switch (ether_type) { case ETHERTYPE_IP: /* * Run pfil on the member interface and the bridge, both can * be skipped by clearing pfil_member or pfil_bridge. * * Keep the order: * in_if -> bridge_if -> out_if */ if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv = - pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) != + pfil_mbuf_out(V_inet_pfil_head, mp, bifp, NULL)) != PFIL_PASS) break; - if (V_pfil_member && ifp != NULL && (rv = - pfil_run_hooks(V_inet_pfil_head, mp, ifp, dir, NULL)) != - PFIL_PASS) - break; + if (V_pfil_member && ifp != NULL) { + rv = (dir == PFIL_OUT) ? + pfil_mbuf_out(V_inet_pfil_head, mp, ifp, NULL) : + pfil_mbuf_in(V_inet_pfil_head, mp, ifp, NULL); + if (rv != PFIL_PASS) + break; + } if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv = - pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) != + pfil_mbuf_in(V_inet_pfil_head, mp, bifp, NULL)) != PFIL_PASS) break; /* check if we need to fragment the packet */ /* bridge_fragment generates a mbuf chain of packets */ /* that already include eth headers */ if (V_pfil_member && ifp != NULL && dir == PFIL_OUT) { i = (*mp)->m_pkthdr.len; if (i > ifp->if_mtu) { error = bridge_fragment(ifp, mp, &eh2, snap, &llc1); return (error); } } /* Recalculate the ip checksum. */ ip = mtod(*mp, struct ip *); hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) goto bad; if (hlen > (*mp)->m_len) { if ((*mp = m_pullup(*mp, hlen)) == NULL) goto bad; ip = mtod(*mp, struct ip *); if (ip == NULL) goto bad; } ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(*mp, hlen); break; #ifdef INET6 case ETHERTYPE_IPV6: if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv = - pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) != + pfil_mbuf_out(V_inet6_pfil_head, mp, bifp, NULL)) != PFIL_PASS) break; - if (V_pfil_member && ifp != NULL && (rv = - pfil_run_hooks(V_inet6_pfil_head, mp, ifp, dir, NULL)) != - PFIL_PASS) - break; + if (V_pfil_member && ifp != NULL) { + rv = (dir == PFIL_OUT) ? 
+ pfil_mbuf_out(V_inet6_pfil_head, mp, ifp, NULL) : + pfil_mbuf_in(V_inet6_pfil_head, mp, ifp, NULL); + if (rv != PFIL_PASS) + break; + } if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv = - pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) != + pfil_mbuf_in(V_inet6_pfil_head, mp, bifp, NULL)) != PFIL_PASS) break; break; #endif } switch (rv) { case PFIL_CONSUMED: return (0); case PFIL_DROPPED: return (EACCES); default: break; } error = -1; /* * Finally, put everything back the way it was and return */ if (snap) { M_PREPEND(*mp, sizeof(struct llc), M_NOWAIT); if (*mp == NULL) return (error); bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc)); } M_PREPEND(*mp, ETHER_HDR_LEN, M_NOWAIT); if (*mp == NULL) return (error); bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); return (0); bad: m_freem(*mp); *mp = NULL; return (error); } /* * Perform basic checks on header size since * pfil assumes ip_input has already processed * it. Cut-and-pasted from ip_input.c. * Given how simple the IPv6 version is, * does the IPv4 version really need to be * this complicated? * * XXX Should we update ipstat here, or not? * XXX Right now we update ipstat but not * XXX csum_counter. */ static int bridge_ip_checkbasic(struct mbuf **mp) { struct mbuf *m = *mp; struct ip *ip; int len, hlen; u_short sum; if (*mp == NULL) return (-1); if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { if ((m = m_copyup(m, sizeof(struct ip), (max_linkhdr + 3) & ~3)) == NULL) { /* XXXJRT new stat, please */ KMOD_IPSTAT_INC(ips_toosmall); goto bad; } } else if (__predict_false(m->m_len < sizeof (struct ip))) { if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { KMOD_IPSTAT_INC(ips_toosmall); goto bad; } } ip = mtod(m, struct ip *); if (ip == NULL) goto bad; if (ip->ip_v != IPVERSION) { KMOD_IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ KMOD_IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { KMOD_IPSTAT_INC(ips_badhlen); goto bad; } ip = mtod(m, struct ip *); if (ip == NULL) goto bad; } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { KMOD_IPSTAT_INC(ips_badsum); goto bad; } /* Retrieve the packet length. */ len = ntohs(ip->ip_len); /* * Check for additional length bogosity */ if (len < hlen) { KMOD_IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is at least as much as the IP header would have us expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < len) { KMOD_IPSTAT_INC(ips_tooshort); goto bad; } /* Checks out, proceed */ *mp = m; return (0); bad: *mp = m; return (-1); } #ifdef INET6 /* * Same as above, but for IPv6. * Cut-and-pasted from ip6_input.c. * XXX Should we update ip6stat, or not? */ static int bridge_ip6_checkbasic(struct mbuf **mp) { struct mbuf *m = *mp; struct ip6_hdr *ip6; /* * If the IPv6 header is not aligned, slurp it up into a new * mbuf with space for link headers, in the event we forward * it. Otherwise, if it is aligned, make sure the entire base * IPv6 header is in the first mbuf of the chain.
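* * The (max_linkhdr + 3) & ~3 destination offset used below keeps the * pulled-up header 4-byte aligned while leaving room for a link-layer * header to be prepended later.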
*/ if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { struct ifnet *inifp = m->m_pkthdr.rcvif; if ((m = m_copyup(m, sizeof(struct ip6_hdr), (max_linkhdr + 3) & ~3)) == NULL) { /* XXXJRT new stat, please */ IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; } } else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) { struct ifnet *inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } /* Checks out, proceed */ *mp = m; return (0); bad: *mp = m; return (-1); } #endif /* INET6 */ /* * bridge_fragment: * * Fragment mbuf chain in multiple packets and prepend ethernet header. */ static int bridge_fragment(struct ifnet *ifp, struct mbuf **mp, struct ether_header *eh, int snap, struct llc *llc) { struct mbuf *m = *mp, *nextpkt = NULL, *mprev = NULL, *mcur = NULL; struct ip *ip; int error = -1; if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == NULL) goto dropit; ip = mtod(m, struct ip *); m->m_pkthdr.csum_flags |= CSUM_IP; error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist); if (error) goto dropit; /* * Walk the chain and re-add the Ethernet header for * each mbuf packet. */ for (mcur = m; mcur; mcur = mcur->m_nextpkt) { nextpkt = mcur->m_nextpkt; mcur->m_nextpkt = NULL; if (snap) { M_PREPEND(mcur, sizeof(struct llc), M_NOWAIT); if (mcur == NULL) { error = ENOBUFS; if (mprev != NULL) mprev->m_nextpkt = nextpkt; goto dropit; } bcopy(llc, mtod(mcur, caddr_t),sizeof(struct llc)); } M_PREPEND(mcur, ETHER_HDR_LEN, M_NOWAIT); if (mcur == NULL) { error = ENOBUFS; if (mprev != NULL) mprev->m_nextpkt = nextpkt; goto dropit; } bcopy(eh, mtod(mcur, caddr_t), ETHER_HDR_LEN); /* * The previous two M_PREPEND could have inserted one or two * mbufs in front so we have to update the previous packet's * m_nextpkt. */ mcur->m_nextpkt = nextpkt; if (mprev != NULL) mprev->m_nextpkt = mcur; else { /* The first mbuf in the original chain needs to be * updated. 
*/ *mp = mcur; } mprev = mcur; } KMOD_IPSTAT_INC(ips_fragmented); return (error); dropit: for (mcur = *mp; mcur; mcur = m) { /* dropping the full packet chain */ m = mcur->m_nextpkt; m_freem(mcur); } return (error); } static void bridge_linkstate(struct ifnet *ifp) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif; struct epoch_tracker et; NET_EPOCH_ENTER(et); bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { NET_EPOCH_EXIT(et); return; } bridge_linkcheck(sc); bstp_linkstate(&bif->bif_stp); NET_EPOCH_EXIT(et); } static void bridge_linkcheck(struct bridge_softc *sc) { struct bridge_iflist *bif; int new_link, hasls; BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc); new_link = LINK_STATE_DOWN; hasls = 0; /* Our link is considered up if at least one of our ports is active */ CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp->if_capabilities & IFCAP_LINKSTATE) hasls++; if (bif->bif_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } if (!CK_LIST_EMPTY(&sc->sc_iflist) && !hasls) { /* If no interfaces support link-state then we default to up */ new_link = LINK_STATE_UP; } if_link_state_change(sc->sc_ifp, new_link); } diff --git a/sys/net/if_enc.c b/sys/net/if_enc.c index da6ce7a1a815..b5ea1c68692c 100644 --- a/sys/net/if_enc.c +++ b/sys/net/if_enc.c @@ -1,450 +1,454 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 The FreeBSD Project. * Copyright (c) 2015 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #define ENCMTU (1024+512) /* XXX this define must have the same value as in OpenBSD */ #define M_CONF 0x0400 /* payload was encrypted (ESP-transport) */ #define M_AUTH 0x0800 /* payload was authenticated (AH or ESP auth) */ #define M_AUTH_AH 0x2000 /* header was authenticated (AH) */ struct enchdr { u_int32_t af; u_int32_t spi; u_int32_t flags; }; struct enc_softc { struct ifnet *sc_ifp; }; VNET_DEFINE_STATIC(struct enc_softc *, enc_sc); #define V_enc_sc VNET(enc_sc) VNET_DEFINE_STATIC(struct if_clone *, enc_cloner); #define V_enc_cloner VNET(enc_cloner) static int enc_ioctl(struct ifnet *, u_long, caddr_t); static int enc_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int enc_clone_create(struct if_clone *, int, caddr_t); static void enc_clone_destroy(struct ifnet *); static int enc_add_hhooks(struct enc_softc *); static void enc_remove_hhooks(struct enc_softc *); static const char encname[] = "enc"; #define IPSEC_ENC_AFTER_PFIL 0x04 /* * Before and after are relative to when we are stripping the * outer IP header. * * The AFTER_PFIL flag is used only for bpf_mask_*. It enables BPF capture * after PFIL hook execution, which can be useful when a PFIL hook * modifies the packet, e.g. for address translation. If the PFIL hook * consumes the mbuf, nothing is captured. */ VNET_DEFINE_STATIC(int, filter_mask_in) = IPSEC_ENC_BEFORE; VNET_DEFINE_STATIC(int, bpf_mask_in) = IPSEC_ENC_BEFORE; VNET_DEFINE_STATIC(int, filter_mask_out) = IPSEC_ENC_BEFORE; VNET_DEFINE_STATIC(int, bpf_mask_out) = IPSEC_ENC_BEFORE | IPSEC_ENC_AFTER; #define V_filter_mask_in VNET(filter_mask_in) #define V_bpf_mask_in VNET(bpf_mask_in) #define V_filter_mask_out VNET(filter_mask_out) #define V_bpf_mask_out VNET(bpf_mask_out) static SYSCTL_NODE(_net, OID_AUTO, enc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "enc sysctl"); static SYSCTL_NODE(_net_enc, OID_AUTO, in, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "enc input sysctl"); static SYSCTL_NODE(_net_enc, OID_AUTO, out, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "enc output sysctl"); SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_in), 0, "IPsec input firewall filter mask"); SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_in), 0, "IPsec input bpf mask"); SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_out), 0, "IPsec output firewall filter mask"); SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_out), 0, "IPsec output bpf mask"); static void enc_clone_destroy(struct ifnet *ifp) { struct enc_softc *sc; sc = ifp->if_softc; KASSERT(sc == V_enc_sc, ("sc != ifp->if_softc")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_DEVBUF); V_enc_sc = NULL; } static int enc_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct ifnet *ifp; struct enc_softc *sc; sc = malloc(sizeof(struct enc_softc), M_DEVBUF, M_WAITOK | M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ENC); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } if (V_enc_sc != NULL) { if_free(ifp); free(sc,
M_DEVBUF); return (EEXIST); } V_enc_sc = sc; if_initname(ifp, encname, unit); ifp->if_mtu = ENCMTU; ifp->if_ioctl = enc_ioctl; ifp->if_output = enc_output; ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_ENC, sizeof(struct enchdr)); return (0); } static int enc_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { m_freem(m); return (0); } static int enc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { if (cmd != SIOCSIFFLAGS) return (EINVAL); if (ifp->if_flags & IFF_UP) ifp->if_drv_flags |= IFF_DRV_RUNNING; else ifp->if_drv_flags &= ~IFF_DRV_RUNNING; return (0); } static void enc_bpftap(struct ifnet *ifp, struct mbuf *m, const struct secasvar *sav, int32_t hhook_type, uint8_t enc, uint8_t af) { struct enchdr hdr; if (hhook_type == HHOOK_TYPE_IPSEC_IN && (enc & V_bpf_mask_in) == 0) return; else if (hhook_type == HHOOK_TYPE_IPSEC_OUT && (enc & V_bpf_mask_out) == 0) return; if (bpf_peers_present(ifp->if_bpf) == 0) return; hdr.af = af; hdr.spi = sav->spi; hdr.flags = 0; if (sav->alg_enc != SADB_EALG_NONE) hdr.flags |= M_CONF; if (sav->alg_auth != SADB_AALG_NONE) hdr.flags |= M_AUTH; bpf_mtap2(ifp->if_bpf, &hdr, sizeof(hdr), m); } /* * One helper hook function is used by all hook points. * + from hhook_type we can determine the packet direction: * HHOOK_TYPE_IPSEC_IN or HHOOK_TYPE_IPSEC_OUT; * + from hhook_id we can determine the address family: AF_INET or AF_INET6; * + udata contains a pointer to enc_softc; * + ctx_data contains a pointer to struct ipsec_ctx_data. */ static int enc_hhook(int32_t hhook_type, int32_t hhook_id, void *udata, void *ctx_data, void *hdata, struct osd *hosd) { struct ipsec_ctx_data *ctx; struct enc_softc *sc; struct ifnet *ifp, *rcvif; struct pfil_head *ph; - int pdir; + int pdir, ret; sc = (struct enc_softc *)udata; ifp = sc->sc_ifp; if ((ifp->if_flags & IFF_UP) == 0) return (0); ctx = (struct ipsec_ctx_data *)ctx_data; /* XXX: wrong hook point was used by caller?
*/ if (ctx->af != hhook_id) return (EPFNOSUPPORT); enc_bpftap(ifp, *ctx->mp, ctx->sav, hhook_type, ctx->enc, ctx->af); switch (hhook_type) { case HHOOK_TYPE_IPSEC_IN: if (ctx->enc == IPSEC_ENC_BEFORE) { /* Do accounting only once */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, (*ctx->mp)->m_pkthdr.len); } if ((ctx->enc & V_filter_mask_in) == 0) return (0); /* skip pfil processing */ pdir = PFIL_IN; break; case HHOOK_TYPE_IPSEC_OUT: if (ctx->enc == IPSEC_ENC_BEFORE) { /* Do accounting only once */ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, (*ctx->mp)->m_pkthdr.len); } if ((ctx->enc & V_filter_mask_out) == 0) return (0); /* skip pfil processing */ pdir = PFIL_OUT; break; default: return (EINVAL); } switch (hhook_id) { #ifdef INET case AF_INET: ph = V_inet_pfil_head; break; #endif #ifdef INET6 case AF_INET6: ph = V_inet6_pfil_head; break; #endif default: ph = NULL; } if (ph == NULL || (pdir == PFIL_OUT && !PFIL_HOOKED_OUT(ph)) || (pdir == PFIL_IN && !PFIL_HOOKED_IN(ph))) return (0); /* Make the packet look like it was received on enc(4) */ rcvif = (*ctx->mp)->m_pkthdr.rcvif; (*ctx->mp)->m_pkthdr.rcvif = ifp; - if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, ctx->inp) != PFIL_PASS) { + if (pdir == PFIL_IN) + ret = pfil_mbuf_in(ph, ctx->mp, ifp, ctx->inp); + else + ret = pfil_mbuf_out(ph, ctx->mp, ifp, ctx->inp); + if (ret != PFIL_PASS) { *ctx->mp = NULL; /* consumed by filter */ return (EACCES); } (*ctx->mp)->m_pkthdr.rcvif = rcvif; enc_bpftap(ifp, *ctx->mp, ctx->sav, hhook_type, IPSEC_ENC_AFTER_PFIL, ctx->af); return (0); } static int enc_add_hhooks(struct enc_softc *sc) { struct hookinfo hki; int error; error = EPFNOSUPPORT; hki.hook_func = enc_hhook; hki.hook_helper = NULL; hki.hook_udata = sc; #ifdef INET hki.hook_id = AF_INET; hki.hook_type = HHOOK_TYPE_IPSEC_IN; error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], &hki, HHOOK_WAITOK); if (error != 0) return (error); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], &hki, HHOOK_WAITOK); if (error != 0) return (error); #endif #ifdef INET6 hki.hook_id = AF_INET6; hki.hook_type = HHOOK_TYPE_IPSEC_IN; error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], &hki, HHOOK_WAITOK); if (error != 0) return (error); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], &hki, HHOOK_WAITOK); if (error != 0) return (error); #endif return (error); } static void enc_remove_hhooks(struct enc_softc *sc) { struct hookinfo hki; hki.hook_func = enc_hhook; hki.hook_helper = NULL; hki.hook_udata = sc; #ifdef INET hki.hook_id = AF_INET; hki.hook_type = HHOOK_TYPE_IPSEC_IN; hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], &hki); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], &hki); #endif #ifdef INET6 hki.hook_id = AF_INET6; hki.hook_type = HHOOK_TYPE_IPSEC_IN; hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], &hki); hki.hook_type = HHOOK_TYPE_IPSEC_OUT; hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], &hki); #endif } static void vnet_enc_init(const void *unused __unused) { V_enc_sc = NULL; V_enc_cloner = if_clone_simple(encname, enc_clone_create, enc_clone_destroy, 1); } VNET_SYSINIT(vnet_enc_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_enc_init, NULL); static void vnet_enc_init_proto(void *unused __unused) { KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc)); if (enc_add_hhooks(V_enc_sc) != 0) enc_clone_destroy(V_enc_sc->sc_ifp);
} VNET_SYSINIT(vnet_enc_init_proto, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_enc_init_proto, NULL); static void vnet_enc_uninit(const void *unused __unused) { KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc)); if_clone_detach(V_enc_cloner); } VNET_SYSUNINIT(vnet_enc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_enc_uninit, NULL); /* * The hhook consumer needs to go before ip[6]_destroy are called on * SI_ORDER_THIRD. */ static void vnet_enc_uninit_hhook(const void *unused __unused) { KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc)); enc_remove_hhooks(V_enc_sc); } VNET_SYSUNINIT(vnet_enc_uninit_hhook, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_enc_uninit_hhook, NULL); static int enc_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t enc_mod = { "if_enc", enc_modevent, 0 }; DECLARE_MODULE(if_enc, enc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_enc, 1); diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 8bc66497e161..34ff4ac22e7f 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -1,1500 +1,1499 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_ethernet_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); static int ether_requestencap(struct ifnet *, struct if_encap_req *); #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Handle link-layer encapsulation requests. 
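* * A caller supplies an if_encap_req with rtype IFENCAP_LL and a buffer * of at least ETHER_HDR_LEN bytes; on success the prebuilt header is * left in req->buf with its length in req->bufsize. A sketch of a * typical request (field values illustrative only): * * struct if_encap_req req; * * bzero(&req, sizeof(req)); * req.rtype = IFENCAP_LL; * req.buf = hdrbuf; * req.bufsize = sizeof(hdrbuf); * req.family = AF_INET; * req.lladdr = dst_mac; * error = ifp->if_requestencap(ifp, &req);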
*/ static int ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) { struct ether_header *eh; struct arphdr *ah; uint16_t etype; const u_char *lladdr; if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < ETHER_HDR_LEN) return (ENOMEM); eh = (struct ether_header *)req->buf; lladdr = req->lladdr; req->lladdr_off = 0; switch (req->family) { case AF_INET: etype = htons(ETHERTYPE_IP); break; case AF_INET6: etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_ETHER); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); break; } if (req->flags & IFENCAP_FLAG_BROADCAST) lladdr = ifp->if_broadcastaddr; break; default: return (EAFNOSUPPORT); } memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); req->bufsize = sizeof(struct ether_header); return (0); } static int ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro, u_char *phdr, uint32_t *pflags, struct llentry **plle) { uint32_t lleflags = 0; int error = 0; #if defined(INET) || defined(INET6) struct ether_header *eh = (struct ether_header *)phdr; uint16_t etype; #endif if (plle) *plle = NULL; switch (dst->sa_family) { #ifdef INET case AF_INET: if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { if (m->m_flags & M_BCAST) memcpy(eh->ether_dhost, ifp->if_broadcastaddr, ETHER_ADDR_LEN); else { const struct in_addr *a; a = &(((const struct sockaddr_in *)dst)->sin_addr); ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); } etype = htons(ETHERTYPE_IP); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if ((m->m_flags & M_MCAST) == 0) { int af = RO_GET_FAMILY(ro, dst); error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr, &lleflags, plle); } else { const struct in6_addr *a6; a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); etype = htons(ETHERTYPE_IPV6); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); if (m != NULL) m_freem(m); return (EAFNOSUPPORT); } if (error == EHOSTDOWN) { if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) error = EHOSTUNREACH; } if (error != 0) return (error); *pflags = RT_MAY_LOOP; if (lleflags & LLE_IFADDR) *pflags |= RT_L2_ME; return (0); } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. 
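* * The precomputed-header fast path: if the caller's route carries a * prepend buffer (ro_prepend) or a cached, valid llentry, the prebuilt * link-layer header (r_linkdata/r_hdrlen) is used and address * resolution is skipped; otherwise ether_resolve_addr() builds the * header into a stack buffer.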
*/ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { int error = 0; char linkhdr[ETHER_HDR_LEN], *phdr; struct ether_header *eh; struct pf_mtag *t; bool loop_copy; int hlen; /* link layer header length */ uint32_t pflags; struct llentry *lle = NULL; int addref = 0; phdr = NULL; pflags = 0; if (ro != NULL) { /* XXX BPF uses ro_prepend */ if (ro->ro_prepend != NULL) { phdr = ro->ro_prepend; hlen = ro->ro_plen; } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { if ((ro->ro_flags & RT_LLE_CACHE) != 0) { lle = ro->ro_lle; if (lle != NULL && (lle->la_flags & LLE_VALID) == 0) { LLE_FREE(lle); lle = NULL; /* redundant */ ro->ro_lle = NULL; } if (lle == NULL) { /* if we lookup, keep cache */ addref = 1; } else /* * Notify LLE code that * the entry was used * by datapath. */ llentry_provide_feedback(lle); } if (lle != NULL) { phdr = lle->r_linkdata; hlen = lle->r_hdrlen; pflags = lle->r_flags; } } } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); if (phdr == NULL) { /* No prepend data supplied. Try to calculate ourselves. */ phdr = linkhdr; hlen = ETHER_HDR_LEN; error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags, addref ? &lle : NULL); if (addref && lle != NULL) ro->ro_lle = lle; if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0)); } loop_copy = (pflags & RT_MAY_LOOP) != 0; /* * Add local net header. If no space in first mbuf, * allocate another. * * Note that we do prepend regardless of RT_HAS_HEADER flag. * This is done because BPF code shifts m_data pointer * to the end of ethernet header prior to calling if_output(). */ M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); if ((pflags & RT_HAS_HEADER) == 0) { eh = mtod(m, struct ether_header *); memcpy(eh, phdr, hlen); } /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); (void)if_simloop(ifp, n, RO_GET_FAMILY(ro, dst), hlen); } else if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* * Bridges require special output handling. 
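* BRIDGE_OUTPUT() takes over the mbuf and returns the result in * 'error'; the frame does not come back to this function.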
*/ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } static bool ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp) { struct ether_8021q_tag qtag; struct ether_header *eh; eh = mtod(*mp, struct ether_header *); if (ntohs(eh->ether_type) == ETHERTYPE_VLAN || ntohs(eh->ether_type) == ETHERTYPE_QINQ) return (true); qtag.vid = 0; qtag.pcp = pcp; qtag.proto = ETHERTYPE_VLAN; if (ether_8021q_frame(mp, ifp, ifp, &qtag)) return (true); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (false); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { uint8_t pcp; pcp = ifp->if_pcp; if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN && !ether_set_pcp(&m, ifp, pcp)) return (0); if (PFIL_HOOKED_OUT(V_link_pfil_head)) - switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT, - NULL)) { + switch (pfil_mbuf_out(V_link_pfil_head, &m, ifp, NULL)) { case PFIL_DROPPED: return (EACCES); case PFIL_CONSUMED: return (0); } #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return (EAFNOSUPPORT); /* NOTREACHED */ break; }; } #endif #endif /* * Queue message on interface, update output statistics if successful, * and start output if interface not yet active. * * If KMSAN is enabled, use it to verify that the data does not contain * any uninitialized bytes. */ kmsan_check_mbuf(m, "ether_output"); return ((ifp->if_transmit)(ifp, m)); } /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); random_harvest_queue_ether(m, sizeof(*m)); #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. 
*/ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { switch (etype) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return; /* NOTREACHED */ break; }; } #endif #endif CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_ethernet_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_ethernet_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. */ if ((m->m_flags & M_VLANTAG) == 0 && ((etype == ETHERTYPE_VLAN) || (etype == ETHERTYPE_QINQ))) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. 
The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. However, if RSS is enabled, hook up RSS affinity * so that when deferred or hybrid dispatch is enabled, we can redistribute * load based on RSS. * * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or * not it had already done work distribution via multi-queue. Then we could * direct dispatch in the event load balancing was already complete and * handle the case of interfaces with different capabilities better. * * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions * at multiple layers? * * XXXRW: For now, enable all this only if RSS is compiled in, although it * works fine without RSS. Need to characterise the performance overhead * of the detour through the netisr code in the event the result is always * direct dispatch. */ static void ether_nh_input(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: NULL interface pointer", __func__)); ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_DIRECT, .nh_m2cpuid = rss_m2cpuid, #else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, #endif }; static void ether_init(__unused void *arg) { netisr_register(ðer_nh); } SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { struct pfil_head_args args; args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_ETHERNET; args.pa_headname = PFIL_ETHER_NAME; V_link_pfil_head = pfil_head_register(&args); #ifdef VIMAGE netisr_register_vnet(ðer_nh); #endif } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); #ifdef VIMAGE static void vnet_ether_pfil_destroy(__unused void *arg) { pfil_head_unregister(V_link_pfil_head); } VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, vnet_ether_pfil_destroy, NULL); static void vnet_ether_destroy(__unused void *arg) { netisr_unregister_vnet(ðer_nh); } VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); #endif static void ether_input(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; struct mbuf *mn; bool needs_epoch; needs_epoch = !(ifp->if_flags & IFF_KNOWSEPOCH); /* * The drivers are allowed to pass in a chain of packets linked with * m_nextpkt. We split them up into separate packets here and pass * them up. This allows the drivers to amortize the receive lock. */ CURVNET_SET_QUIET(ifp->if_vnet); if (__predict_false(needs_epoch)) NET_EPOCH_ENTER(et); while (m) { mn = m->m_nextpkt; m->m_nextpkt = NULL; /* * We will rely on rcvif being set properly in the deferred * context, so assert it is correct here. 
MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); netisr_dispatch(NETISR_ETHER, m); m = mn; } if (__predict_false(needs_epoch)) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; NET_EPOCH_ASSERT(); KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { - i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); + i = pfil_mbuf_in(V_link_pfil_head, &m, ifp, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } /* Strip off Ethernet header. */ m_adj(m, ETHER_HDR_LEN); netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", <pointer>, ":"); * * since there's no static buffer involved.
*/ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; ifp->if_mtu = ETHERMTU; if_attach(ifp); ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (ifp->if_hw_addr != NULL) bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); /* All necessary bits are set up; announce it now. */ EVENTHANDLER_INVOKE(ether_ifattach_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop.
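* * The table-driven version consumes four bits per step: with the * 16-entry table indexed by (crc & 0xf), each byte costs two * * crc = (crc >> 4) ^ crctab[crc & 0xf]; * * rounds instead of eight single-bit shift-and-XOR iterations.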
*/ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ETHER_ADDR_LEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; case SIOCSLANPCP: error = priv_check(curthread, PRIV_NET_SETLANPCP); if (error != 0) break; if (ifr->ifr_lan_pcp > 7 && ifr->ifr_lan_pcp != IFNET_PCP_NONE) { error = EINVAL; } else { ifp->if_pcp = ifr->ifr_lan_pcp; /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); } break; case SIOCGLANPCP: ifr->ifr_lan_pcp = ifp->if_pcp; break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.)
*/ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static moduledata_t ether_mod = { .name = "ether", }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap_proto(struct mbuf *m, uint16_t tag, uint16_t proto) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(proto); evl->evl_tag = htons(tag); return (m); } void ether_bpf_mtap_if(struct ifnet *ifp, struct mbuf *m) { if (bpf_peers_present(ifp->if_bpf)) { M_ASSERTVALID(m); if ((m->m_flags & M_VLANTAG) != 0) ether_vlan_mtap(ifp->if_bpf, m, NULL, 0); else bpf_mtap(ifp->if_bpf, m); } } static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IEEE 802.1Q VLAN"); static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "for consistency"); VNET_DEFINE_STATIC(int, soft_pad); #define V_soft_pad VNET(soft_pad) SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(soft_pad), 0, "pad short frames before tagging"); /* * For now, make preserving PCP via an mbuf tag optional, as it increases * per-packet memory allocations and frees. In the future, it would be * preferable to reuse ether_vtag for this, or similar. 
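 *
 * The tag payload is a single uint8_t holding the PCP value, stored
 * immediately after the m_tag header; hence the *(uint8_t *)(mtag + 1)
 * access in ether_8021q_frame() below.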
*/ VNET_DEFINE(int, vlan_mtag_pcp) = 0; #define V_vlan_mtag_pcp VNET(vlan_mtag_pcp) SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(vlan_mtag_pcp), 0, "Retain VLAN PCP information as packets are passed up the stack"); bool ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, struct ether_8021q_tag *qtag) { struct m_tag *mtag; int n; uint16_t tag; static const char pad[8]; /* just zeros */ /* * Pad the frame to the minimum size allowed if told to. * This option is in accord with IEEE Std 802.1Q, 2003 Ed., * paragraph C.4.4.3.b. It can help to work around buggy * bridges that violate paragraph C.4.4.3.a from the same * document, i.e., fail to pad short frames after untagging. * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but * untagging it will produce a 62-byte frame, which is a runt * and requires padding. There are VLAN-enabled network * devices that just discard such runts instead or mishandle * them somehow. */ if (V_soft_pad && p->if_type == IFT_ETHER) { for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len; n > 0; n -= sizeof(pad)) { if (!m_append(*mp, min(n, sizeof(pad)), pad)) break; } if (n > 0) { m_freem(*mp); *mp = NULL; if_printf(ife, "cannot pad short frame"); return (false); } } /* * If PCP is set in mbuf, use it */ if ((*mp)->m_flags & M_VLANTAG) { qtag->pcp = EVL_PRIOFTAG((*mp)->m_pkthdr.ether_vtag); } /* * If underlying interface can do VLAN tag insertion itself, * just pass the packet along. However, we need some way to * tell the interface where the packet came from so that it * knows how to find the VLAN tag to use, so we attach a * packet tag that holds it. */ if (V_vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL)) != NULL) tag = EVL_MAKETAG(qtag->vid, *(uint8_t *)(mtag + 1), 0); else tag = EVL_MAKETAG(qtag->vid, qtag->pcp, 0); if ((p->if_capenable & IFCAP_VLAN_HWTAGGING) && (qtag->proto == ETHERTYPE_VLAN)) { (*mp)->m_pkthdr.ether_vtag = tag; (*mp)->m_flags |= M_VLANTAG; } else { *mp = ether_vlanencap_proto(*mp, tag, qtag->proto); if (*mp == NULL) { if_printf(ife, "unable to prepend 802.1Q header"); return (false); } } return (true); } /* * Allocate an address from the FreeBSD Foundation OUI. This uses a * cryptographic hash function on the containing jail's name, UUID and the * interface name to attempt to provide a unique but stable address. * Pseudo-interfaces which require a MAC address should use this function to * allocate non-locally-administered addresses. */ void ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr) { SHA1_CTX ctx; char *buf; char uuid[HOSTUUIDLEN + 1]; uint64_t addr; int i, sz; char digest[SHA1_RESULTLEN]; char jailname[MAXHOSTNAMELEN]; getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid)); if (strncmp(uuid, DEFAULT_HOSTUUID, sizeof(uuid)) == 0) { /* Fall back to a random mac address. */ goto rando; } /* If each (vnet) jail would also have a unique hostuuid this would not * be necessary. */ getjailname(curthread->td_ucred, jailname, sizeof(jailname)); sz = asprintf(&buf, M_TEMP, "%s-%s-%s", uuid, if_name(ifp), jailname); if (sz < 0) { /* Fall back to a random mac address. 
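 * (asprintf() returns -1 when the M_TEMP buffer cannot be allocated,
 * leaving nothing to hash; the fallback below produces a random
 * unicast, locally-administered address instead of one under the
 * FreeBSD Foundation OUI.)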
*/ goto rando; } SHA1Init(&ctx); SHA1Update(&ctx, buf, sz); SHA1Final(digest, &ctx); free(buf, M_TEMP); addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & OUI_FREEBSD_GENERATED_MASK; addr = OUI_FREEBSD(addr); for (i = 0; i < ETHER_ADDR_LEN; ++i) { hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & 0xFF; } return; rando: arc4rand(hwaddr, sizeof(*hwaddr), 0); /* Unicast */ hwaddr->octet[0] &= 0xFE; /* Locally administered. */ hwaddr->octet[0] |= 0x02; } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 739138a6f791..e62935b247da 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -1,1596 +1,1595 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_mbuf_stress_test.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SCTP) || defined(SCTP_SUPPORT) #include #include #endif #include #include #include #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; static inline int ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags, struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error) { struct m_tag *fwd_tag = NULL; struct mbuf *m; struct in_addr odst; struct ip *ip; - int pflags = PFIL_OUT; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; - switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) { + switch (pfil_mbuf_out(V_inet_pfil_head, mp, ifp, inp)) { case PFIL_DROPPED: *error = EACCES; /* FALLTHROUGH */ case PFIL_CONSUMED: return 1; /* Finished */ case PFIL_PASS: *error = 0; } m = *mp; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; return -1; /* Reloop */ } /* See if fib was changed by packet filter. */ if ((*fibnum) != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; *fibnum = M_GETFIB(m); return -1; /* Reloop for FIB change */ } /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } /* Or forward to some other address? 
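 * A firewall (e.g. ipfw's fwd rule) requests this by setting
 * M_IP_NEXTHOP and attaching a PACKET_TAG_IPFORWARD mbuf tag whose
 * payload, a struct sockaddr_in stored right after the tag header,
 * becomes the new destination below.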
*/ if ((m->m_flags & M_IP_NEXTHOP) && ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); return -1; /* Reloop for CHANGE of dst */ } return 0; } static int ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m, const struct sockaddr *gw, struct route *ro, bool stamp_tag) { #ifdef KERN_TLS struct ktls_session *tls = NULL; #endif struct m_snd_tag *mst; int error; MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mst = NULL; #ifdef KERN_TLS /* * If this is an unencrypted TLS record, save a reference to * the record. This local reference is used to call * ktls_output_eagain after the mbuf has been freed (thus * dropping the mbuf's reference) in if_output. */ if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) { tls = ktls_hold(m->m_next->m_epg_tls); mst = tls->snd_tag; /* * If a TLS session doesn't have a valid tag, it must * have had an earlier ifp mismatch, so drop this * packet. */ if (mst == NULL) { m_freem(m); error = EAGAIN; goto done; } /* * Always stamp tags that include NIC ktls. */ stamp_tag = true; } #endif #ifdef RATELIMIT if (inp != NULL && mst == NULL) { if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp)) in_pcboutput_txrtlmt(inp, ifp, m); if (inp->inp_snd_tag != NULL) mst = inp->inp_snd_tag; } #endif if (stamp_tag && mst != NULL) { KASSERT(m->m_pkthdr.rcvif == NULL, ("trying to add a send tag to a forwarded packet")); if (mst->ifp != ifp) { m_freem(m); error = EAGAIN; goto done; } /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } error = (*ifp->if_output)(ifp, m, gw, ro); done: /* Check for route change invalidating send tags. */ #ifdef KERN_TLS if (tls != NULL) { if (error == EAGAIN) error = ktls_output_eagain(inp, tls); ktls_free(tls); } #endif #ifdef RATELIMIT if (error == EAGAIN) in_pcboutput_eagain(inp); #endif return (error); } /* rte<>ro_flags translation */ static inline void rt_update_ro_flags(struct route *ro, const struct nhop_object *nh) { int nh_flags = nh->nh_flags; ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW); ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0; ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0; ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? RT_HAS_GW : 0; } /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_nh initialized, the route lookup is * skipped and ro->ro_nh is used. If ro is present but ro->ro_nh is NULL, * then the result of the route lookup is stored in ro->ro_nh. * * In the IP forwarding case, the packet will arrive with options already * inserted, so the caller must pass a NULL opt pointer.
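 *
 * A minimal call (sketch): fill in the skeletal header and invoke
 *
 *	error = ip_output(m, NULL, NULL, 0, NULL, NULL);
 *
 * letting ip_output() perform its own route lookup on ip_dst; m is
 * always consumed, so the caller must not touch it afterwards.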
*/ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu = 0; int error = 0; int vlan_pcp = -1; struct sockaddr_in *dst; const struct sockaddr *gw; struct in_ifaddr *ia = NULL; struct in_addr src; int isbroadcast; uint16_t ip_len, ip_off; struct route iproute; uint32_t fibnum; #if defined(IPSEC) || defined(IPSEC_SUPPORT) int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); NET_EPOCH_ASSERT(); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } if ((inp->inp_flags2 & INP_2PCP_SET) != 0) vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT; #ifdef NUMA m->m_pkthdr.numa_domain = inp->inp_numa_domain; #endif } if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip_fillid(ip); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } if ((flags & IP_FORWARDING) == 0) IPSTAT_INC(ips_localout); /* * dst/gw handling: * * gw is readonly but can point either to dst OR rt_gateway, * therefore we need to restore gw if we're redoing the lookup. */ fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } dst = (struct sockaddr_in *)&ro->ro_dst; if (ro->ro_nh == NULL) { dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } gw = (const struct sockaddr *)dst; again: /* * Validate route against routing table additions; * a better/more specific route might have been added. */ if (inp != NULL && ro->ro_nh != NULL) NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. * Also check whether routing cache needs invalidation. */ if (ro->ro_nh != NULL && ((!NH_IS_VALID(ro->ro_nh)) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) RO_INVALIDATE_CACHE(ro); ia = NULL; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = 1; src = IA_SIN(ia)->sin_addr; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = ifp->if_flags & IFF_BROADCAST ?
in_ifaddr_broadcast(dst->sin_addr, ia) : 0; src = IA_SIN(ia)->sin_addr; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; mtu = ifp->if_mtu; IFP_TO_IA(ifp, ia); isbroadcast = 0; /* fool gcc */ /* Interface may have no addresses. */ if (ia != NULL) src = IA_SIN(ia)->sin_addr; else src.s_addr = INADDR_ANY; } else if (ro != &iproute) { if (ro->ro_nh == NULL) { /* * We want to do any cloning requested by the link * layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ uint32_t flowid; flowid = m->m_pkthdr.flowid; ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_REF, flowid); if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh))) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } } struct nhop_object *nh = ro->ro_nh; ia = ifatoia(nh->nh_ifa); ifp = nh->nh_ifp; counter_u64_add(nh->nh_pksent, 1); rt_update_ro_flags(ro, nh); if (nh->nh_flags & NHF_GATEWAY) gw = &nh->gw_sa; if (nh->nh_flags & NHF_HOST) isbroadcast = (nh->nh_flags & NHF_BROADCAST); else if ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET)) isbroadcast = in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia); else isbroadcast = 0; mtu = nh->nh_mtu; src = IA_SIN(ia)->sin_addr; } else { struct nhop_object *nh; nh = fib4_lookup(M_GETFIB(m), dst->sin_addr, 0, NHR_NONE, m->m_pkthdr.flowid); if (nh == NULL) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ifp = nh->nh_ifp; mtu = nh->nh_mtu; rt_update_ro_flags(ro, nh); if (nh->nh_flags & NHF_GATEWAY) gw = &nh->gw_sa; ia = ifatoia(nh->nh_ifa); src = IA_SIN(ia)->sin_addr; isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) == (NHF_HOST | NHF_BROADCAST)) || ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET) && in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia))); } /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p", __func__, mtu, ro, (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = (const struct sockaddr *)dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. 
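 * For example, a socket sending to a multicast group without binding
 * a local address reaches this point with ip_src still INADDR_ANY and
 * inherits the address of the multicast interface selected above.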
*/ if (ip->ip_src.s_addr == INADDR_ANY) ip->ip_src = src; if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If the rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) ip->ip_src = src; /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (PFIL_HOOKED_OUT(V_inet_pfil_head)) { switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum, &error)) { case 1: /* Finished */ goto done; case 0: /* Continue normally */ ip = mtod(m, struct ip *); break; case -1: /* Need to try again */ /* Reset everything for a new round */ if (ro != NULL) { RO_NHFREE(ro); ro->ro_prepend = NULL; } gw = (const struct sockaddr *)dst; ip = mtod(m, struct ip *); goto again; } } if (vlan_pcp > -1) EVL_APPLY_PRI(m, vlan_pcp); /* IN_LOOPBACK must not appear on the wire - RFC1122. */ if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } /* Ensure the packet data is mapped if the interface requires it.
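 * Unmapped (M_EXTPG) mbufs carry raw pages with no KVA mapping, as
 * produced by sendfile(2) and kernel TLS; a driver that does not
 * advertise IFCAP_MEXTPG needs mb_unmapped_to_ext() to convert the
 * chain back to ordinary mapped clusters first.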
*/ if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) { m = mb_unmapped_to_ext(m); if (m == NULL) { IPSTAT_INC(ips_odropped); error = ENOBUFS; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. * Note that if_vxlan could have requested TSO even though the outer * frame is UDP. It is correct to not fragment such datagrams and * instead just pass them on to the driver. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & (CSUM_TSO | CSUM_INNER_TSO)) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO)) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = ip_output_send(inp, ifp, m, gw, ro, (flags & IP_NO_SND_TAG_RL) ? false : true); goto done; } /* Balk when the DF bit is set or the interface doesn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp, mtod(m, struct ip *), NULL); error = ip_output_send(inp, ifp, m, gw, ro, true); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller.
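 *
 * Callers walk the resulting m_nextpkt chain the way ip_output()
 * does above (sketch, mirroring the caller in ip_output()):
 *
 *	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 *	for (; m != NULL; m = m0) {
 *		m0 = m->m_nextpkt;
 *		m->m_nextpkt = NULL;
 *		if (error == 0)
 *			error = ip_output_send(inp, ifp, m, gw, ro, true);
 *		else
 *			m_freem(m);
 *	}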
* * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); /* * The packet must not have the "Don't Fragment" flag set, and each * fragment must carry at least 8 bytes of payload. */ if (__predict_false((ip_off & IP_DF) || len < 8)) { IPSTAT_INC(ips_cantfrag); return (EMSGSIZE); } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; off = MIN(mtu, m0->m_pkthdr.len); /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back to the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ?
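 * Presumably yes: if the datagram being split is itself a fragment,
 * adding ip_off keeps the new offsets relative to the start of the
 * original datagram and carries a set IP_MF flag through to every
 * piece.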
*/ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; #ifdef MAC mac_netinet_fragment(m0, m); #endif mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; struct udphdr *uh; uint16_t cklen, csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2; if (m->m_pkthdr.csum_flags & CSUM_UDP) { /* if the udp header is not in the first mbuf, copy udplen */ if (offset + sizeof(struct udphdr) > m->m_len) { m_copydata(m, offset + offsetof(struct udphdr, uh_ulen), sizeof(cklen), (caddr_t)&cklen); cklen = ntohs(cklen); } else { uh = (struct udphdr *)mtodo(m, offset); cklen = ntohs(uh->uh_ulen); } csum = in_cksum_skip(m, cklen + offset, offset); if (csum == 0) csum = 0xffff; } else { cklen = ntohs(ip->ip_len); csum = in_cksum_skip(m, cklen, offset); } offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(csum) > m->m_len) m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); else *(u_short *)mtodo(m, offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT_LB: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT_LB) != 0) inp->inp_flags2 |= INP_REUSEPORT_LB; else inp->inp_flags2 &= ~INP_REUSEPORT_LB; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ?
M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif case IP_VLAN_PCP: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IP_VLAN_PCP: if ((optval >= -1) && (optval <= (INP_2PCP_MASK >> INP_2PCP_SHIFT))) { if (optval == -1) { INP_WLOCK(inp); inp->inp_flags2 &= ~(INP_2PCP_SET | INP_2PCP_MASK); INP_WUNLOCK(inp); } else { INP_WLOCK(inp); inp->inp_flags2 |= INP_2PCP_SET; inp->inp_flags2 &= ~INP_2PCP_MASK; inp->inp_flags2 |= optval << INP_2PCP_SHIFT; INP_WUNLOCK(inp); } } else error = EINVAL; break; } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. 
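 * For example, a userland setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
 * &mreq, sizeof(mreq)) ends up in inp_setmoptions() below.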
*/ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: INP_RLOCK(inp); if (inp->inp_options) { struct mbuf *options; options = m_copym(inp->inp_options, 0, M_COPYALL, M_NOWAIT); INP_RUNLOCK(inp); if (options != NULL) { error = sooptcopyout(sopt, mtod(options, char *), options->m_len); m_freem(options); } else error = ENOMEM; } else { INP_RUNLOCK(inp); sopt->sopt_valsize = 0; } break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif case IP_VLAN_PCP: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 
1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; case IP_VLAN_PCP: if (OPTBIT2(INP_2PCP_SET)) { optval = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT; } else { optval = -1; } break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the packet in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); if_simloop(ifp, copym, AF_INET, 0); } }