diff --git a/sys/dev/usb/usb_pf.c b/sys/dev/usb/usb_pf.c index 6ccb5ebbc62b..d0b7f0889cea 100644 --- a/sys/dev/usb/usb_pf.c +++ b/sys/dev/usb/usb_pf.c @@ -1,534 +1,541 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef USB_GLOBAL_INCLUDE_FILE #include USB_GLOBAL_INCLUDE_FILE #else #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* USB_GLOBAL_INCLUDE_FILE */ static void usbpf_init(void *); static void usbpf_uninit(void *); static int usbpf_ioctl(struct ifnet *, u_long, caddr_t); static int usbpf_clone_match(struct if_clone *, const char *); -static int usbpf_clone_create(struct if_clone *, char *, size_t, caddr_t); -static int usbpf_clone_destroy(struct if_clone *, struct ifnet *); +static int usbpf_clone_create(struct if_clone *, char *, size_t, + struct ifc_data *, struct ifnet **); +static int usbpf_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static struct usb_bus *usbpf_ifname2ubus(const char *); static uint32_t usbpf_aggregate_xferflags(struct usb_xfer_flags *); static uint32_t usbpf_aggregate_status(struct usb_xfer_flags_int *); static int usbpf_xfer_frame_is_read(struct usb_xfer *, uint32_t); static uint32_t usbpf_xfer_precompute_size(struct usb_xfer *, int); static struct if_clone *usbpf_cloner; static const char usbusname[] = "usbus"; SYSINIT(usbpf_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, usbpf_init, NULL); SYSUNINIT(usbpf_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, usbpf_uninit, NULL); static void usbpf_init(void *arg) { + struct if_clone_addreq req = { + 
.match_f = usbpf_clone_match, + .create_f = usbpf_clone_create, + .destroy_f = usbpf_clone_destroy, + }; /* Register the usbus cloner and keep its handle in usbpf_cloner; the added (+) call hands the callbacks over through req. */ - usbpf_cloner = if_clone_advanced(usbusname, 0, usbpf_clone_match, - usbpf_clone_create, usbpf_clone_destroy); + usbpf_cloner = ifc_attach_cloner(usbusname, &req); } /* * usbpf_uninit: detach the cloner, then walk every device of the devclass named by usbusname and destroy the tap interface of each bus whose softc still has a non-NULL ifp. Returns early without touching any bus when the devclass is missing or devclass_get_devices() reports an error. The device array is released with free(devlp, M_TEMP) before returning. */ static void usbpf_uninit(void *arg) { int devlcnt; device_t *devlp; devclass_t dc; struct usb_bus *ubus; int error; int i; if_clone_detach(usbpf_cloner); dc = devclass_find(usbusname); if (dc == NULL) return; error = devclass_get_devices(dc, &devlp, &devlcnt); if (error) return; for (i = 0; i < devlcnt; i++) { ubus = device_get_softc(devlp[i]); if (ubus != NULL && ubus->ifp != NULL) - usbpf_clone_destroy(usbpf_cloner, ubus->ifp); + usbpf_clone_destroy(usbpf_cloner, ubus->ifp, 0); } free(devlp, M_TEMP); } /* * usbpf_ioctl: interface ioctl handler installed by usbpf_clone_create; every request is rejected with EINVAL. */ static int usbpf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { /* No configuration allowed. */ return (EINVAL); } /* * usbpf_ifname2ubus: map an interface name to the softc of the matching device in the usbusname devclass. Returns NULL when the name does not start with usbusname, when ifc_name2unit() fails or yields a negative unit, or when the devclass or the device for that unit does not exist. */ static struct usb_bus * usbpf_ifname2ubus(const char *ifname) { device_t dev; devclass_t dc; int unit; int error; /* sizeof - 1 compares the prefix only, without the terminating NUL */ if (strncmp(ifname, usbusname, sizeof(usbusname) - 1) != 0) return (NULL); error = ifc_name2unit(ifname, &unit); if (error || unit < 0) return (NULL); dc = devclass_find(usbusname); if (dc == NULL) return (NULL); dev = devclass_get_device(dc, unit); if (dev == NULL) return (NULL); return (device_get_softc(dev)); } /* * usbpf_clone_match: cloner match callback. Returns 1 only when the name resolves to a bus that has no interface yet (ubus->ifp == NULL); returns 0 otherwise. */ static int usbpf_clone_match(struct if_clone *ifc, const char *name) { struct usb_bus *ubus; ubus = usbpf_ifname2ubus(name); if (ubus == NULL) return (0); if (ubus->ifp != NULL) return (0); return (1); } /* * usbpf_clone_create: cloner create callback. Parses the unit from the name, looks up the bus, allocates an IFT_USB ifnet for it, attaches it to bpf with DLT_USB and, with the added (+) signature, reports the new ifnet through *ifpp. Returns 0 on success, the error from ifc_name2unit() or ifc_alloc_unit(), EINVAL for a negative unit, ENOSPC when if_alloc() fails, and the literal 1 when the bus is missing or already has an interface. NOTE(review): the literal 1 is not a named errno constant - confirm callers expect this value. */ static int -usbpf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +usbpf_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { int error; int unit; struct ifnet *ifp; struct usb_bus *ubus; error = ifc_name2unit(name, &unit); if (error) return (error); if (unit < 0) return (EINVAL); ubus = usbpf_ifname2ubus(name); if (ubus == NULL) return (1); if (ubus->ifp != NULL) return (1); 
/* reserve the unit number in the cloner before allocating the ifnet */ error = ifc_alloc_unit(ifc, &unit); if (error) { device_printf(ubus->parent, "usbpf: Could not allocate " "instance\n"); return (error); } ifp = ubus->ifp = if_alloc(IFT_USB); if (ifp == NULL) { /* give the reserved unit back; ubus->ifp was assigned the same NULL */ ifc_free_unit(ifc, unit); device_printf(ubus->parent, "usbpf: Could not allocate " "instance\n"); return (ENOSPC); } strlcpy(ifp->if_xname, name, sizeof(ifp->if_xname)); ifp->if_softc = ubus; ifp->if_dname = usbusname; ifp->if_dunit = unit; ifp->if_ioctl = usbpf_ioctl; if_attach(ifp); ifp->if_flags |= IFF_UP; rt_ifmsg(ifp); /* * XXX According to the specification of DLT_USB, it indicates * packets beginning with USB setup header. But not sure all * packets would be. */ bpfattach(ifp, DLT_USB, USBPF_HDR_LEN); + *ifpp = ifp; return (0); } /* * usbpf_clone_destroy: cloner destroy callback, also called directly from usbpf_uninit and usbpf_detach. Clears ubus->ifp under the USB bus lock, then detaches bpf, detaches and frees the ifnet, and releases its unit number. Always returns 0. The flags parameter of the added (+) signature is not read in this body. */ static int -usbpf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +usbpf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct usb_bus *ubus; int unit; /* read softc and unit before the ifnet is freed further down */ ubus = ifp->if_softc; unit = ifp->if_dunit; /* * Lock USB before clearing the "ifp" pointer, to avoid * clearing the pointer in the middle of a TAP operation: */ USB_BUS_LOCK(ubus); ubus->ifp = NULL; USB_BUS_UNLOCK(ubus); bpfdetach(ifp); if_detach(ifp); if_free(ifp); ifc_free_unit(ifc, unit); return (0); } /* * usbpf_attach: bus attach hook. It only prints a message when bootverbose is set; no interface is created here. */ void usbpf_attach(struct usb_bus *ubus) { if (bootverbose) device_printf(ubus->parent, "usbpf: Attached\n"); } /* * usbpf_detach: bus detach hook. Destroys the tap interface of the bus when one exists, then prints a message when bootverbose is set. */ void usbpf_detach(struct usb_bus *ubus) { if (ubus->ifp != NULL) - usbpf_clone_destroy(usbpf_cloner, ubus->ifp); + usbpf_clone_destroy(usbpf_cloner, ubus->ifp, 0); if (bootverbose) device_printf(ubus->parent, "usbpf: Detached\n"); } /* * usbpf_aggregate_xferflags: build a USBPF_FLAG_* bitmask with one bit for each struct usb_xfer_flags field that equals 1. */ static uint32_t usbpf_aggregate_xferflags(struct usb_xfer_flags *flags) { uint32_t val = 0; if (flags->force_short_xfer == 1) val |= USBPF_FLAG_FORCE_SHORT_XFER; if (flags->short_xfer_ok == 1) val |= USBPF_FLAG_SHORT_XFER_OK; if (flags->short_frames_ok == 1) val |= USBPF_FLAG_SHORT_FRAMES_OK; if (flags->pipe_bof == 1) val |= USBPF_FLAG_PIPE_BOF; if (flags->proxy_buffer == 1) val |= USBPF_FLAG_PROXY_BUFFER; if 
(flags->ext_buffer == 1) val |= USBPF_FLAG_EXT_BUFFER; if (flags->manual_status == 1) val |= USBPF_FLAG_MANUAL_STATUS; if (flags->no_pipe_ok == 1) val |= USBPF_FLAG_NO_PIPE_OK; if (flags->stall_pipe == 1) val |= USBPF_FLAG_STALL_PIPE; return (val); } /* * usbpf_aggregate_status: build a USBPF_STATUS_* bitmask with one bit for each struct usb_xfer_flags_int field that equals 1. The three bdma_* fields are examined only when USB_HAVE_BUSDMA is non-zero at compile time. */ static uint32_t usbpf_aggregate_status(struct usb_xfer_flags_int *flags) { uint32_t val = 0; if (flags->open == 1) val |= USBPF_STATUS_OPEN; if (flags->transferring == 1) val |= USBPF_STATUS_TRANSFERRING; if (flags->did_dma_delay == 1) val |= USBPF_STATUS_DID_DMA_DELAY; if (flags->did_close == 1) val |= USBPF_STATUS_DID_CLOSE; if (flags->draining == 1) val |= USBPF_STATUS_DRAINING; if (flags->started == 1) val |= USBPF_STATUS_STARTED; if (flags->bandwidth_reclaimed == 1) val |= USBPF_STATUS_BW_RECLAIMED; if (flags->control_xfr == 1) val |= USBPF_STATUS_CONTROL_XFR; if (flags->control_hdr == 1) val |= USBPF_STATUS_CONTROL_HDR; if (flags->control_act == 1) val |= USBPF_STATUS_CONTROL_ACT; if (flags->control_stall == 1) val |= USBPF_STATUS_CONTROL_STALL; if (flags->short_frames_ok == 1) val |= USBPF_STATUS_SHORT_FRAMES_OK; if (flags->short_xfer_ok == 1) val |= USBPF_STATUS_SHORT_XFER_OK; #if USB_HAVE_BUSDMA if (flags->bdma_enable == 1) val |= USBPF_STATUS_BDMA_ENABLE; if (flags->bdma_no_post_sync == 1) val |= USBPF_STATUS_BDMA_NO_POST_SYNC; if (flags->bdma_setup == 1) val |= USBPF_STATUS_BDMA_SETUP; #endif if (flags->isochronous_xfr == 1) val |= USBPF_STATUS_ISOCHRONOUS_XFR; if (flags->curr_dma_set == 1) val |= USBPF_STATUS_CURR_DMA_SET; if (flags->can_cancel_immed == 1) val |= USBPF_STATUS_CAN_CANCEL_IMMED; if (flags->doing_callback == 1) val |= USBPF_STATUS_DOING_CALLBACK; return (val); } /* * usbpf_xfer_frame_is_read: return non-zero when the given frame of the transfer is in the read direction. Frame 0 of a control transfer that has control_hdr set is decided by the USB mode alone: 1 in device mode, 0 otherwise. Every other frame takes the value of USB_GET_DATA_ISREAD(xfer). */ static int usbpf_xfer_frame_is_read(struct usb_xfer *xfer, uint32_t frame) { int isread; if ((frame == 0) && (xfer->flags_int.control_xfr != 0) && (xfer->flags_int.control_hdr != 0)) { /* special case */ if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) { /* The device controller writes to memory */ isread = 1; } else { /* The host 
controller reads from memory */ isread = 0; } } else { isread = USB_GET_DATA_ISREAD(xfer); } return (isread); } /* * usbpf_xfer_precompute_size: return the byte length of the capture record for this transfer. The total is USBPF_HDR_LEN, plus USBPF_FRAME_HDR_LEN per frame, plus the USBPF_FRAME_ALIGN-ed length of every frame whose payload is included: write frames when type is USBPF_XFERTAP_SUBMIT, read frames for any other type. The frame count is xfer->nframes on submit and xfer->aframes otherwise. */ static uint32_t usbpf_xfer_precompute_size(struct usb_xfer *xfer, int type) { uint32_t totlen; uint32_t x; uint32_t nframes; if (type == USBPF_XFERTAP_SUBMIT) nframes = xfer->nframes; else nframes = xfer->aframes; totlen = USBPF_HDR_LEN + (USBPF_FRAME_HDR_LEN * nframes); /* precompute all trace lengths */ for (x = 0; x != nframes; x++) { if (usbpf_xfer_frame_is_read(xfer, x)) { /* read data is counted for every type except submit */ if (type != USBPF_XFERTAP_SUBMIT) { totlen += USBPF_FRAME_ALIGN( xfer->frlengths[x]); } } else { /* write data is counted on submit only */ if (type == USBPF_XFERTAP_SUBMIT) { totlen += USBPF_FRAME_ALIGN( xfer->frlengths[x]); } } } return (totlen); } /* * usbpf_xfertap: build one capture record for the transfer in a temporary M_TEMP buffer (packet header, per-frame headers, zero-padded frame data) and pass it to bpf_tap() on the interface of the bus, then free the buffer. Returns without doing anything when the bus has no interface, the interface has no if_bpf, or bpf_peers_present() is false. When the M_NOWAIT allocation fails it logs an out-of-memory message and returns. */ void usbpf_xfertap(struct usb_xfer *xfer, int type) { struct usb_bus *bus; struct usbpf_pkthdr *up; struct usbpf_framehdr *uf; usb_frlength_t offset; uint32_t totlen; uint32_t frame; uint32_t temp; uint32_t nframes; uint32_t x; uint8_t *buf; uint8_t *ptr; bus = xfer->xroot->bus; /* sanity checks */ if (bus->ifp == NULL || bus->ifp->if_bpf == NULL) return; if (!bpf_peers_present(bus->ifp->if_bpf)) return; totlen = usbpf_xfer_precompute_size(xfer, type); /* same frame-count selection as in usbpf_xfer_precompute_size */ if (type == USBPF_XFERTAP_SUBMIT) nframes = xfer->nframes; else nframes = xfer->aframes; /* * XXX TODO XXX * * When BPF supports it we could pass a fragmented array of * buffers avoiding the data copy operation here. 
*/ buf = ptr = malloc(totlen, M_TEMP, M_NOWAIT); if (buf == NULL) { device_printf(bus->parent, "usbpf: Out of memory\n"); return; } up = (struct usbpf_pkthdr *)ptr; ptr += USBPF_HDR_LEN; /* fill out header */ temp = device_get_unit(bus->bdev); up->up_totlen = htole32(totlen); up->up_busunit = htole32(temp); up->up_address = xfer->xroot->udev->device_index; if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) up->up_mode = USBPF_MODE_DEVICE; else up->up_mode = USBPF_MODE_HOST; up->up_type = type; up->up_xfertype = xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE; temp = usbpf_aggregate_xferflags(&xfer->flags); up->up_flags = htole32(temp); temp = usbpf_aggregate_status(&xfer->flags_int); up->up_status = htole32(temp); temp = xfer->error; up->up_error = htole32(temp); temp = xfer->interval; up->up_interval = htole32(temp); up->up_frames = htole32(nframes); temp = xfer->max_packet_size; up->up_packet_size = htole32(temp); temp = xfer->max_packet_count; up->up_packet_count = htole32(temp); temp = xfer->endpointno; up->up_endpoint = htole32(temp); up->up_speed = xfer->xroot->udev->speed; /* clear reserved area */ memset(up->up_reserved, 0, sizeof(up->up_reserved)); /* init offset and frame */ offset = 0; frame = 0; /* iterate all the USB frames and copy data, if any */ for (x = 0; x != nframes; x++) { uint32_t length; int isread; /* get length */ length = xfer->frlengths[x]; /* get frame header pointer */ uf = (struct usbpf_framehdr *)ptr; ptr += USBPF_FRAME_HDR_LEN; /* fill out packet header */ uf->length = htole32(length); uf->flags = 0; /* get information about data read/write */ isread = usbpf_xfer_frame_is_read(xfer, x); /* check if we need to copy any data */ if (isread) { if (type == USBPF_XFERTAP_SUBMIT) length = 0; else { uf->flags |= htole32( USBPF_FRAMEFLAG_DATA_FOLLOWS); } } else { if (type != USBPF_XFERTAP_SUBMIT) length = 0; else { uf->flags |= htole32( USBPF_FRAMEFLAG_DATA_FOLLOWS); } } /* check if data is read direction */ if (isread) uf->flags |= 
htole32(USBPF_FRAMEFLAG_READ); /* copy USB data, if any */ if (length != 0) { /* copy data */ usbd_copy_out(&xfer->frbuffers[frame], offset, ptr, length); /* align length */ temp = USBPF_FRAME_ALIGN(length); /* zero pad */ if (temp != length) memset(ptr + length, 0, temp - length); ptr += temp; } if (xfer->flags_int.isochronous_xfr) { offset += usbd_xfer_old_frame_length(xfer, x); } else { frame ++; } } bpf_tap(bus->ifp->if_bpf, buf, totlen); free(buf, M_TEMP); } diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index 664a1f885a46..f2538a78f943 100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -1,3782 +1,3792 @@ /* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Jason R. Thorpe for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp */ /* * Network interface bridge support. 
* * TODO: * * - Currently only supports Ethernet-like interfaces (Ethernet, * 802.11, VLANs on Ethernet, etc.) Figure out a nice way * to bridge other types of interfaces (maybe consider * heterogeneous bridges). */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include /* for net/if.h */ #include #include /* string functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #if defined(INET) || defined(INET6) #include #endif #include #include #include #include #include #include #include #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* * Size of the route hash table. Must be a power of two. */ #ifndef BRIDGE_RTHASH_SIZE #define BRIDGE_RTHASH_SIZE 1024 #endif #define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1) /* * Default maximum number of addresses to cache. */ #ifndef BRIDGE_RTABLE_MAX #define BRIDGE_RTABLE_MAX 2000 #endif /* * Timeout (in seconds) for entries learned dynamically. */ #ifndef BRIDGE_RTABLE_TIMEOUT #define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */ #endif /* * Number of seconds between walks of the route list. */ #ifndef BRIDGE_RTABLE_PRUNE_PERIOD #define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60) #endif /* * List of capabilities to possibly mask on the member interface. */ #define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM|\ IFCAP_TXCSUM_IPV6) /* * List of capabilities to strip */ #define BRIDGE_IFCAPS_STRIP IFCAP_LRO /* * Bridge locking * * The bridge relies heavily on the epoch(9) system to protect its data * structures. 
This means we can safely use CK_LISTs while in NET_EPOCH, but we * must ensure there is only one writer at a time. * * That is: for read accesses we only need to be in NET_EPOCH, but for write * accesses we must hold: * * - BRIDGE_RT_LOCK, for any change to bridge_rtnodes * - BRIDGE_LOCK, for any other change * * The BRIDGE_LOCK is a sleepable lock, because it is held across ioctl() * calls to bridge member interfaces and these ioctl()s can sleep. * The BRIDGE_RT_LOCK is a non-sleepable mutex, because it is sometimes * required while we're in NET_EPOCH and then we're not allowed to sleep. */ #define BRIDGE_LOCK_INIT(_sc) do { \ sx_init(&(_sc)->sc_sx, "if_bridge"); \ mtx_init(&(_sc)->sc_rt_mtx, "if_bridge rt", NULL, MTX_DEF); \ } while (0) #define BRIDGE_LOCK_DESTROY(_sc) do { \ sx_destroy(&(_sc)->sc_sx); \ mtx_destroy(&(_sc)->sc_rt_mtx); \ } while (0) #define BRIDGE_LOCK(_sc) sx_xlock(&(_sc)->sc_sx) #define BRIDGE_UNLOCK(_sc) sx_xunlock(&(_sc)->sc_sx) #define BRIDGE_LOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SX_XLOCKED) #define BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(_sc) \ MPASS(in_epoch(net_epoch_preempt) || sx_xlocked(&(_sc)->sc_sx)) #define BRIDGE_UNLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SX_UNLOCKED) #define BRIDGE_RT_LOCK(_sc) mtx_lock(&(_sc)->sc_rt_mtx) #define BRIDGE_RT_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_rt_mtx) #define BRIDGE_RT_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->sc_rt_mtx, MA_OWNED) #define BRIDGE_RT_LOCK_OR_NET_EPOCH_ASSERT(_sc) \ MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(_sc)->sc_rt_mtx)) /* * Bridge interface list entry. */ struct bridge_iflist { CK_LIST_ENTRY(bridge_iflist) bif_next; struct ifnet *bif_ifp; /* member if */ struct bstp_port bif_stp; /* STP state */ uint32_t bif_flags; /* member if flags */ int bif_savedcaps; /* saved capabilities */ uint32_t bif_addrmax; /* max # of addresses */ uint32_t bif_addrcnt; /* cur. 
# of addresses */ uint32_t bif_addrexceeded;/* # of address violations */ struct epoch_context bif_epoch_ctx; }; /* * Bridge route node. */ struct bridge_rtnode { CK_LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ CK_LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ struct bridge_iflist *brt_dst; /* destination if */ unsigned long brt_expire; /* expiration time */ uint8_t brt_flags; /* address flags */ uint8_t brt_addr[ETHER_ADDR_LEN]; uint16_t brt_vlan; /* vlan id */ struct vnet *brt_vnet; struct epoch_context brt_epoch_ctx; }; #define brt_ifp brt_dst->bif_ifp /* * Software state for each bridge. */ struct bridge_softc { struct ifnet *sc_ifp; /* make this an interface */ LIST_ENTRY(bridge_softc) sc_list; struct sx sc_sx; struct mtx sc_rt_mtx; uint32_t sc_brtmax; /* max # of addresses */ uint32_t sc_brtcnt; /* cur. # of addresses */ uint32_t sc_brttimeout; /* rt timeout in seconds */ struct callout sc_brcallout; /* bridge callout */ CK_LIST_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ CK_LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ CK_LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ uint32_t sc_rthash_key; /* key for hash */ CK_LIST_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ struct bstp_state sc_stp; /* STP state */ uint32_t sc_brtexceeded; /* # of cache drops */ struct ifnet *sc_ifaddr; /* member mac copied from */ struct ether_addr sc_defaddr; /* Default MAC address */ struct epoch_context sc_epoch_ctx; }; VNET_DEFINE_STATIC(struct sx, bridge_list_sx); #define V_bridge_list_sx VNET(bridge_list_sx) static eventhandler_tag bridge_detach_cookie; int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; VNET_DEFINE_STATIC(uma_zone_t, bridge_rtnode_zone); #define V_bridge_rtnode_zone VNET(bridge_rtnode_zone) -static int bridge_clone_create(struct if_clone *, int, caddr_t); -static void bridge_clone_destroy(struct ifnet *); +static int bridge_clone_create(struct if_clone *, char *, 
size_t, + struct ifc_data *, struct ifnet **); +static int bridge_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static int bridge_ioctl(struct ifnet *, u_long, caddr_t); static void bridge_mutecaps(struct bridge_softc *); static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *, int); static void bridge_ifdetach(void *arg __unused, struct ifnet *); static void bridge_init(void *); static void bridge_dummynet(struct mbuf *, struct ifnet *); static void bridge_stop(struct ifnet *, int); static int bridge_transmit(struct ifnet *, struct mbuf *); #ifdef ALTQ static void bridge_altq_start(if_t); static int bridge_altq_transmit(if_t, struct mbuf *); #endif static void bridge_qflush(struct ifnet *); static struct mbuf *bridge_input(struct ifnet *, struct mbuf *); static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static int bridge_enqueue(struct bridge_softc *, struct ifnet *, struct mbuf *); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, struct mbuf *m); static void bridge_timer(void *); static void bridge_broadcast(struct bridge_softc *, struct ifnet *, struct mbuf *, int); static void bridge_span(struct bridge_softc *, struct mbuf *); static int bridge_rtupdate(struct bridge_softc *, const uint8_t *, uint16_t, struct bridge_iflist *, int, uint8_t); static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *, uint16_t); static void bridge_rttrim(struct bridge_softc *); static void bridge_rtage(struct bridge_softc *); static void bridge_rtflush(struct bridge_softc *, int); static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, uint16_t); static void bridge_rtable_init(struct bridge_softc *); static void bridge_rtable_fini(struct bridge_softc *); static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *); static struct bridge_rtnode *bridge_rtnode_lookup(struct 
bridge_softc *, const uint8_t *, uint16_t); static int bridge_rtnode_insert(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtnode_destroy(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtable_expire(struct ifnet *, int); static void bridge_state_change(struct ifnet *, int); static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *, const char *name); static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *, struct ifnet *ifp); static void bridge_delete_member(struct bridge_softc *, struct bridge_iflist *, int); static void bridge_delete_span(struct bridge_softc *, struct bridge_iflist *); static int bridge_ioctl_add(struct bridge_softc *, void *); static int bridge_ioctl_del(struct bridge_softc *, void *); static int bridge_ioctl_gifflags(struct bridge_softc *, void *); static int bridge_ioctl_sifflags(struct bridge_softc *, void *); static int bridge_ioctl_scache(struct bridge_softc *, void *); static int bridge_ioctl_gcache(struct bridge_softc *, void *); static int bridge_ioctl_gifs(struct bridge_softc *, void *); static int bridge_ioctl_rts(struct bridge_softc *, void *); static int bridge_ioctl_saddr(struct bridge_softc *, void *); static int bridge_ioctl_sto(struct bridge_softc *, void *); static int bridge_ioctl_gto(struct bridge_softc *, void *); static int bridge_ioctl_daddr(struct bridge_softc *, void *); static int bridge_ioctl_flush(struct bridge_softc *, void *); static int bridge_ioctl_gpri(struct bridge_softc *, void *); static int bridge_ioctl_spri(struct bridge_softc *, void *); static int bridge_ioctl_ght(struct bridge_softc *, void *); static int bridge_ioctl_sht(struct bridge_softc *, void *); static int bridge_ioctl_gfd(struct bridge_softc *, void *); static int bridge_ioctl_sfd(struct bridge_softc *, void *); static int bridge_ioctl_gma(struct bridge_softc *, void *); static int bridge_ioctl_sma(struct bridge_softc *, void *); static int bridge_ioctl_sifprio(struct 
bridge_softc *, void *); static int bridge_ioctl_sifcost(struct bridge_softc *, void *); static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); static int bridge_ioctl_addspan(struct bridge_softc *, void *); static int bridge_ioctl_delspan(struct bridge_softc *, void *); static int bridge_ioctl_gbparam(struct bridge_softc *, void *); static int bridge_ioctl_grte(struct bridge_softc *, void *); static int bridge_ioctl_gifsstp(struct bridge_softc *, void *); static int bridge_ioctl_sproto(struct bridge_softc *, void *); static int bridge_ioctl_stxhc(struct bridge_softc *, void *); static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *, int); static int bridge_ip_checkbasic(struct mbuf **mp); #ifdef INET6 static int bridge_ip6_checkbasic(struct mbuf **mp); #endif /* INET6 */ static int bridge_fragment(struct ifnet *, struct mbuf **mp, struct ether_header *, int, struct llc *); static void bridge_linkstate(struct ifnet *ifp); static void bridge_linkcheck(struct bridge_softc *sc); /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ #define VLANTAGOF(_m) \ (_m->m_flags & M_VLANTAG) ? 
EVL_VLANOFTAG(_m->m_pkthdr.ether_vtag) : 1 static struct bstp_cb_ops bridge_ops = { .bcb_state = bridge_state_change, .bcb_rtage = bridge_rtable_expire }; SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Bridge"); /* only pass IP[46] packets when pfil is enabled */ VNET_DEFINE_STATIC(int, pfil_onlyip) = 1; #define V_pfil_onlyip VNET(pfil_onlyip) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_onlyip), 0, "Only pass IP packets when pfil is enabled"); /* run pfil hooks on the bridge interface */ VNET_DEFINE_STATIC(int, pfil_bridge) = 1; #define V_pfil_bridge VNET(pfil_bridge) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_bridge), 0, "Packet filter on the bridge interface"); /* layer2 filter with ipfw */ VNET_DEFINE_STATIC(int, pfil_ipfw); #define V_pfil_ipfw VNET(pfil_ipfw) /* layer2 ARP filter with ipfw */ VNET_DEFINE_STATIC(int, pfil_ipfw_arp); #define V_pfil_ipfw_arp VNET(pfil_ipfw_arp) SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw_arp), 0, "Filter ARP packets through IPFW layer2"); /* run pfil hooks on the member interface */ VNET_DEFINE_STATIC(int, pfil_member) = 1; #define V_pfil_member VNET(pfil_member) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_member), 0, "Packet filter on the member interface"); /* run pfil hooks on the physical interface for locally destined packets */ VNET_DEFINE_STATIC(int, pfil_local_phys); #define V_pfil_local_phys VNET(pfil_local_phys) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_local_phys), 0, "Packet filter on the physical interface for locally destined packets"); /* log STP state changes */ VNET_DEFINE_STATIC(int, log_stp); #define V_log_stp VNET(log_stp) SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, 
CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(log_stp), 0, "Log STP state changes"); /* share MAC with first bridge member */ VNET_DEFINE_STATIC(int, bridge_inherit_mac); #define V_bridge_inherit_mac VNET(bridge_inherit_mac) SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(bridge_inherit_mac), 0, "Inherit MAC address from the first bridge member"); VNET_DEFINE_STATIC(int, allow_llz_overlap) = 0; #define V_allow_llz_overlap VNET(allow_llz_overlap) SYSCTL_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0, "Allow overlap of link-local scope " "zones of a bridge interface and the member interfaces"); struct bridge_control { int (*bc_func)(struct bridge_softc *, void *); int bc_argsize; int bc_flags; }; #define BC_F_COPYIN 0x01 /* copy arguments in */ #define BC_F_COPYOUT 0x02 /* copy arguments out */ #define BC_F_SUSER 0x04 /* do super-user check */ const struct bridge_control bridge_control_table[] = { { bridge_ioctl_add, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_del, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gifflags, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_sifflags, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_scache, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gcache, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_gifs, sizeof(struct ifbifconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_rts, sizeof(struct ifbaconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_saddr, sizeof(struct ifbareq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sto, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gto, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_daddr, sizeof(struct ifbareq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_flush, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gpri, sizeof(struct ifbrparam), BC_F_COPYOUT 
}, { bridge_ioctl_spri, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_ght, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sht, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gfd, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sfd, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gma, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sma, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifprio, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifcost, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_addspan, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_delspan, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gbparam, sizeof(struct ifbropreq), BC_F_COPYOUT }, { bridge_ioctl_grte, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_gifsstp, sizeof(struct ifbpstpconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_sproto, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_stxhc, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, }; const int bridge_control_table_size = nitems(bridge_control_table); VNET_DEFINE_STATIC(LIST_HEAD(, bridge_softc), bridge_list); #define V_bridge_list VNET(bridge_list) #define BRIDGE_LIST_LOCK_INIT(x) sx_init(&V_bridge_list_sx, \ "if_bridge list") #define BRIDGE_LIST_LOCK_DESTROY(x) sx_destroy(&V_bridge_list_sx) #define BRIDGE_LIST_LOCK(x) sx_xlock(&V_bridge_list_sx) #define BRIDGE_LIST_UNLOCK(x) sx_xunlock(&V_bridge_list_sx) VNET_DEFINE_STATIC(struct if_clone *, bridge_cloner); #define V_bridge_cloner VNET(bridge_cloner) static const char bridge_name[] = "bridge"; static void vnet_bridge_init(const void *unused __unused) { V_bridge_rtnode_zone = uma_zcreate("bridge_rtnode", sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); BRIDGE_LIST_LOCK_INIT(); 
LIST_INIT(&V_bridge_list); - V_bridge_cloner = if_clone_simple(bridge_name, - bridge_clone_create, bridge_clone_destroy, 0); + + struct if_clone_addreq req = { + .create_f = bridge_clone_create, + .destroy_f = bridge_clone_destroy, + .flags = IFC_F_AUTOUNIT, + }; + V_bridge_cloner = ifc_attach_cloner(bridge_name, &req); } VNET_SYSINIT(vnet_bridge_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_bridge_init, NULL); static void vnet_bridge_uninit(const void *unused __unused) { - if_clone_detach(V_bridge_cloner); + ifc_detach_cloner(V_bridge_cloner); V_bridge_cloner = NULL; BRIDGE_LIST_LOCK_DESTROY(); /* Callbacks may use the UMA zone. */ NET_EPOCH_DRAIN_CALLBACKS(); uma_zdestroy(V_bridge_rtnode_zone); } VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_bridge_uninit, NULL); static int bridge_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: bridge_dn_p = bridge_dummynet; bridge_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bridge_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, bridge_detach_cookie); bridge_dn_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t bridge_mod = { "if_bridge", bridge_modevent, 0 }; DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_bridge, 1); MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1); /* * handler for net.link.bridge.ipfw */ static int sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS) { int enable = V_pfil_ipfw; int error; error = sysctl_handle_int(oidp, &enable, 0, req); enable &= 1; if (enable != V_pfil_ipfw) { V_pfil_ipfw = enable; /* * Disable pfil so that ipfw doesnt run twice, if the user * really wants both then they can re-enable pfil_bridge and/or * pfil_member. Also allow non-ip packets as ipfw can filter by * layer2 type. 
*/ if (V_pfil_ipfw) { V_pfil_onlyip = 0; V_pfil_bridge = 0; V_pfil_member = 0; } } return (error); } SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET | CTLFLAG_NEEDGIANT, &VNET_NAME(pfil_ipfw), 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW"); #ifdef VIMAGE static void bridge_reassign(struct ifnet *ifp, struct vnet *newvnet, char *arg) { struct bridge_softc *sc = ifp->if_softc; struct bridge_iflist *bif; BRIDGE_LOCK(sc); while ((bif = CK_LIST_FIRST(&sc->sc_iflist)) != NULL) bridge_delete_member(sc, bif, 0); while ((bif = CK_LIST_FIRST(&sc->sc_spanlist)) != NULL) { bridge_delete_span(sc, bif); } BRIDGE_UNLOCK(sc); ether_reassign(ifp, newvnet, arg); } #endif /* * bridge_clone_create: * * Create a new bridge instance. */ static int -bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) +bridge_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { struct bridge_softc *sc; struct ifnet *ifp; sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } BRIDGE_LOCK_INIT(sc); sc->sc_brtmax = BRIDGE_RTABLE_MAX; sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT; /* Initialize our routing table. 
*/ bridge_rtable_init(sc); callout_init_mtx(&sc->sc_brcallout, &sc->sc_rt_mtx, 0); CK_LIST_INIT(&sc->sc_iflist); CK_LIST_INIT(&sc->sc_spanlist); ifp->if_softc = sc; - if_initname(ifp, bridge_name, unit); + if_initname(ifp, bridge_name, ifd->unit); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = bridge_ioctl; #ifdef ALTQ ifp->if_start = bridge_altq_start; ifp->if_transmit = bridge_altq_transmit; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); #else ifp->if_transmit = bridge_transmit; #endif ifp->if_qflush = bridge_qflush; ifp->if_init = bridge_init; ifp->if_type = IFT_BRIDGE; ether_gen_addr(ifp, &sc->sc_defaddr); bstp_attach(&sc->sc_stp, &bridge_ops); ether_ifattach(ifp, sc->sc_defaddr.octet); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_BRIDGE; #ifdef VIMAGE ifp->if_reassign = bridge_reassign; #endif BRIDGE_LIST_LOCK(); LIST_INSERT_HEAD(&V_bridge_list, sc, sc_list); BRIDGE_LIST_UNLOCK(); + *ifpp = ifp; return (0); } static void bridge_clone_destroy_cb(struct epoch_context *ctx) { struct bridge_softc *sc; sc = __containerof(ctx, struct bridge_softc, sc_epoch_ctx); BRIDGE_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } /* * bridge_clone_destroy: * * Destroy a bridge instance. */ -static void -bridge_clone_destroy(struct ifnet *ifp) +static int +bridge_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct bridge_softc *sc = ifp->if_softc; struct bridge_iflist *bif; struct epoch_tracker et; BRIDGE_LOCK(sc); bridge_stop(ifp, 1); ifp->if_flags &= ~IFF_UP; while ((bif = CK_LIST_FIRST(&sc->sc_iflist)) != NULL) bridge_delete_member(sc, bif, 0); while ((bif = CK_LIST_FIRST(&sc->sc_spanlist)) != NULL) { bridge_delete_span(sc, bif); } /* Tear down the routing table. 
*/ bridge_rtable_fini(sc); BRIDGE_UNLOCK(sc); NET_EPOCH_ENTER(et); callout_drain(&sc->sc_brcallout); BRIDGE_LIST_LOCK(); LIST_REMOVE(sc, sc_list); BRIDGE_LIST_UNLOCK(); bstp_detach(&sc->sc_stp); #ifdef ALTQ IFQ_PURGE(&ifp->if_snd); #endif NET_EPOCH_EXIT(et); ether_ifdetach(ifp); if_free(ifp); NET_EPOCH_CALL(bridge_clone_destroy_cb, &sc->sc_epoch_ctx); + + return (0); } /* * bridge_ioctl: * * Handle a control request from the operator. */ static int bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct bridge_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct bridge_iflist *bif; struct thread *td = curthread; union { struct ifbreq ifbreq; struct ifbifconf ifbifconf; struct ifbareq ifbareq; struct ifbaconf ifbaconf; struct ifbrparam ifbrparam; struct ifbropreq ifbropreq; } args; struct ifdrv *ifd = (struct ifdrv *) data; const struct bridge_control *bc; int error = 0, oldmtu; BRIDGE_LOCK(sc); switch (cmd) { case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGDRVSPEC: case SIOCSDRVSPEC: if (ifd->ifd_cmd >= bridge_control_table_size) { error = EINVAL; break; } bc = &bridge_control_table[ifd->ifd_cmd]; if (cmd == SIOCGDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) == 0) { error = EINVAL; break; } else if (cmd == SIOCSDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) != 0) { error = EINVAL; break; } if (bc->bc_flags & BC_F_SUSER) { error = priv_check(td, PRIV_NET_BRIDGE); if (error) break; } if (ifd->ifd_len != bc->bc_argsize || ifd->ifd_len > sizeof(args)) { error = EINVAL; break; } bzero(&args, sizeof(args)); if (bc->bc_flags & BC_F_COPYIN) { error = copyin(ifd->ifd_data, &args, ifd->ifd_len); if (error) break; } oldmtu = ifp->if_mtu; error = (*bc->bc_func)(sc, &args); if (error) break; /* * Bridge MTU may change during addition of the first port. * If it did, do network layer specific procedure. 
*/ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } if (bc->bc_flags & BC_F_COPYOUT) error = copyout(&args, ifd->ifd_data, ifd->ifd_len); break; case SIOCSIFFLAGS: if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ bridge_stop(ifp, 1); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ BRIDGE_UNLOCK(sc); (*ifp->if_init)(sc); BRIDGE_LOCK(sc); } break; case SIOCSIFMTU: oldmtu = sc->sc_ifp->if_mtu; if (ifr->ifr_mtu < 576) { error = EINVAL; break; } if (CK_LIST_EMPTY(&sc->sc_iflist)) { sc->sc_ifp->if_mtu = ifr->ifr_mtu; break; } CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { error = (*bif->bif_ifp->if_ioctl)(bif->bif_ifp, SIOCSIFMTU, (caddr_t)ifr); if (error != 0) { log(LOG_NOTICE, "%s: invalid MTU: %u for" " member %s\n", sc->sc_ifp->if_xname, ifr->ifr_mtu, bif->bif_ifp->if_xname); error = EINVAL; break; } } if (error) { /* Restore the previous MTU on all member interfaces. */ ifr->ifr_mtu = oldmtu; CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { (*bif->bif_ifp->if_ioctl)(bif->bif_ifp, SIOCSIFMTU, (caddr_t)ifr); } } else { sc->sc_ifp->if_mtu = ifr->ifr_mtu; } break; default: /* * drop the lock as ether_ioctl() will call bridge_start() and * cause the lock to be recursed. 
*/ BRIDGE_UNLOCK(sc); error = ether_ioctl(ifp, cmd, data); BRIDGE_LOCK(sc); break; } BRIDGE_UNLOCK(sc); return (error); } /* * bridge_mutecaps: * * Clear or restore unwanted capabilities on the member interface */ static void bridge_mutecaps(struct bridge_softc *sc) { struct bridge_iflist *bif; int enabled, mask; BRIDGE_LOCK_ASSERT(sc); /* Initial bitmask of capabilities to test */ mask = BRIDGE_IFCAPS_MASK; CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { /* Every member must support it or its disabled */ mask &= bif->bif_savedcaps; } CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { enabled = bif->bif_ifp->if_capenable; enabled &= ~BRIDGE_IFCAPS_STRIP; /* strip off mask bits and enable them again if allowed */ enabled &= ~BRIDGE_IFCAPS_MASK; enabled |= mask; bridge_set_ifcap(sc, bif, enabled); } } static void bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) { struct ifnet *ifp = bif->bif_ifp; struct ifreq ifr; int error, mask, stuck; bzero(&ifr, sizeof(ifr)); ifr.ifr_reqcap = set; if (ifp->if_capenable != set) { error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr); if (error) if_printf(sc->sc_ifp, "error setting capabilities on %s: %d\n", ifp->if_xname, error); mask = BRIDGE_IFCAPS_MASK | BRIDGE_IFCAPS_STRIP; stuck = ifp->if_capenable & mask & ~set; if (stuck != 0) if_printf(sc->sc_ifp, "can't disable some capabilities on %s: 0x%x\n", ifp->if_xname, stuck); } } /* * bridge_lookup_member: * * Lookup a bridge member interface. */ static struct bridge_iflist * bridge_lookup_member(struct bridge_softc *sc, const char *name) { struct bridge_iflist *bif; struct ifnet *ifp; BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc); CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { ifp = bif->bif_ifp; if (strcmp(ifp->if_xname, name) == 0) return (bif); } return (NULL); } /* * bridge_lookup_member_if: * * Lookup a bridge member interface by ifnet*. 
*/ static struct bridge_iflist * bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp) { struct bridge_iflist *bif; BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc); CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp == member_ifp) return (bif); } return (NULL); } static void bridge_delete_member_cb(struct epoch_context *ctx) { struct bridge_iflist *bif; bif = __containerof(ctx, struct bridge_iflist, bif_epoch_ctx); free(bif, M_DEVBUF); } /* * bridge_delete_member: * * Delete the specified member interface. */ static void bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, int gone) { struct ifnet *ifs = bif->bif_ifp; struct ifnet *fif = NULL; struct bridge_iflist *bifl; BRIDGE_LOCK_ASSERT(sc); if (bif->bif_flags & IFBIF_STP) bstp_disable(&bif->bif_stp); ifs->if_bridge = NULL; CK_LIST_REMOVE(bif, bif_next); /* * If removing the interface that gave the bridge its mac address, set * the mac address of the bridge to the address of the next member, or * to its default address if no members are left. */ if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) { if (CK_LIST_EMPTY(&sc->sc_iflist)) { bcopy(&sc->sc_defaddr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = NULL; } else { bifl = CK_LIST_FIRST(&sc->sc_iflist); fif = bifl->bif_ifp; bcopy(IF_LLADDR(fif), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = fif; } EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } bridge_linkcheck(sc); bridge_mutecaps(sc); /* recalcuate now this interface is removed */ BRIDGE_RT_LOCK(sc); bridge_rtdelete(sc, ifs, IFBF_FLUSHALL); BRIDGE_RT_UNLOCK(sc); KASSERT(bif->bif_addrcnt == 0, ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt)); ifs->if_bridge_output = NULL; ifs->if_bridge_input = NULL; ifs->if_bridge_linkstate = NULL; if (!gone) { switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: /* * Take the interface out of promiscuous mode, but only * if it was promiscuous in the first place. 
It might * not be if we're in the bridge_ioctl_add() error path. */ if (ifs->if_flags & IFF_PROMISC) (void) ifpromisc(ifs, 0); break; case IFT_GIF: break; default: #ifdef DIAGNOSTIC panic("bridge_delete_member: impossible"); #endif break; } /* reneable any interface capabilities */ bridge_set_ifcap(sc, bif, bif->bif_savedcaps); } bstp_destroy(&bif->bif_stp); /* prepare to free */ NET_EPOCH_CALL(bridge_delete_member_cb, &bif->bif_epoch_ctx); } /* * bridge_delete_span: * * Delete the specified span interface. */ static void bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif) { BRIDGE_LOCK_ASSERT(sc); KASSERT(bif->bif_ifp->if_bridge == NULL, ("%s: not a span interface", __func__)); CK_LIST_REMOVE(bif, bif_next); NET_EPOCH_CALL(bridge_delete_member_cb, &bif->bif_epoch_ctx); } static int bridge_ioctl_add(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif = NULL; struct ifnet *ifs; int error = 0; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); if (ifs->if_ioctl == NULL) /* must be supported */ return (EINVAL); /* If it's in the span list, it can't be a member. */ CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) return (EBUSY); if (ifs->if_bridge == sc) return (EEXIST); if (ifs->if_bridge != NULL) return (EBUSY); switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_GIF: /* permitted interface types */ break; default: return (EINVAL); } #ifdef INET6 /* * Two valid inet6 addresses with link-local scope must not be * on the parent interface and the member interfaces at the * same time. This restriction is needed to prevent violation * of link-local scope zone. Attempts to add a member * interface which has inet6 addresses when the parent has * inet6 triggers removal of all inet6 addresses on the member * interface. */ /* Check if the parent interface has a link-local scope addr. 
*/ if (V_allow_llz_overlap == 0 && in6ifa_llaonifp(sc->sc_ifp) != NULL) { /* * If any, remove all inet6 addresses from the member * interfaces. */ CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (in6ifa_llaonifp(bif->bif_ifp)) { in6_ifdetach(bif->bif_ifp); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", bif->bif_ifp->if_xname); } } if (in6ifa_llaonifp(ifs)) { in6_ifdetach(ifs); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", ifs->if_xname); } } #endif /* Allow the first Ethernet member to define the MTU */ if (CK_LIST_EMPTY(&sc->sc_iflist)) sc->sc_ifp->if_mtu = ifs->if_mtu; else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { struct ifreq ifr; snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", ifs->if_xname); ifr.ifr_mtu = sc->sc_ifp->if_mtu; error = (*ifs->if_ioctl)(ifs, SIOCSIFMTU, (caddr_t)&ifr); if (error != 0) { log(LOG_NOTICE, "%s: invalid MTU: %u for" " new member %s\n", sc->sc_ifp->if_xname, ifr.ifr_mtu, ifs->if_xname); return (EINVAL); } } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); bif->bif_ifp = ifs; bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER; bif->bif_savedcaps = ifs->if_capenable; /* * Assign the interface's MAC address to the bridge if it's the first * member and the MAC address of the bridge has not been changed from * the default randomly generated one. 
*/ if (V_bridge_inherit_mac && CK_LIST_EMPTY(&sc->sc_iflist) && !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr.octet, ETHER_ADDR_LEN)) { bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = ifs; EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } ifs->if_bridge = sc; ifs->if_bridge_output = bridge_output; ifs->if_bridge_input = bridge_input; ifs->if_bridge_linkstate = bridge_linkstate; bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); /* * XXX: XLOCK HERE!?! * * NOTE: insert_***HEAD*** should be safe for the traversals. */ CK_LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next); /* Set interface capabilities to the intersection set of all members */ bridge_mutecaps(sc); bridge_linkcheck(sc); /* Place the interface into promiscuous mode */ switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: error = ifpromisc(ifs, 1); break; } if (error) bridge_delete_member(sc, bif, 0); return (error); } static int bridge_ioctl_del(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bridge_delete_member(sc, bif, 0); return (0); } static int bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct bstp_port *bp; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bp = &bif->bif_stp; req->ifbr_ifsflags = bif->bif_flags; req->ifbr_state = bp->bp_state; req->ifbr_priority = bp->bp_priority; req->ifbr_path_cost = bp->bp_path_cost; req->ifbr_portno = bif->bif_ifp->if_index & 0xfff; req->ifbr_proto = bp->bp_protover; req->ifbr_role = bp->bp_role; req->ifbr_stpflags = bp->bp_flags; req->ifbr_addrcnt = bif->bif_addrcnt; req->ifbr_addrmax = bif->bif_addrmax; req->ifbr_addrexceeded = bif->bif_addrexceeded; /* Copy STP state options as flags */ if (bp->bp_operedge) req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; if (bp->bp_flags & BSTP_PORT_AUTOEDGE) 
req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE; if (bp->bp_ptp_link) req->ifbr_ifsflags |= IFBIF_BSTP_PTP; if (bp->bp_flags & BSTP_PORT_AUTOPTP) req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP; if (bp->bp_flags & BSTP_PORT_ADMEDGE) req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE; if (bp->bp_flags & BSTP_PORT_ADMCOST) req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; return (0); } static int bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) { struct epoch_tracker et; struct ifbreq *req = arg; struct bridge_iflist *bif; struct bstp_port *bp; int error; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bp = &bif->bif_stp; if (req->ifbr_ifsflags & IFBIF_SPAN) /* SPAN is readonly */ return (EINVAL); NET_EPOCH_ENTER(et); if (req->ifbr_ifsflags & IFBIF_STP) { if ((bif->bif_flags & IFBIF_STP) == 0) { error = bstp_enable(&bif->bif_stp); if (error) { NET_EPOCH_EXIT(et); return (error); } } } else { if ((bif->bif_flags & IFBIF_STP) != 0) bstp_disable(&bif->bif_stp); } /* Pass on STP flags */ bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0); bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0); bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0); bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 
1 : 0); /* Save the bits relating to the bridge */ bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK; NET_EPOCH_EXIT(et); return (0); } static int bridge_ioctl_scache(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; sc->sc_brtmax = param->ifbrp_csize; bridge_rttrim(sc); return (0); } static int bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_csize = sc->sc_brtmax; return (0); } static int bridge_ioctl_gifs(struct bridge_softc *sc, void *arg) { struct ifbifconf *bifc = arg; struct bridge_iflist *bif; struct ifbreq breq; char *buf, *outbuf; int count, buflen, len, error = 0; count = 0; CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) count++; CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) count++; buflen = sizeof(breq) * count; if (bifc->ifbic_len == 0) { bifc->ifbic_len = buflen; return (0); } outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO); if (outbuf == NULL) return (ENOMEM); count = 0; buf = outbuf; len = min(bifc->ifbic_len, buflen); bzero(&breq, sizeof(breq)); CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (len < sizeof(breq)) break; strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, sizeof(breq.ifbr_ifsname)); /* Fill in the ifbreq structure */ error = bridge_ioctl_gifflags(sc, &breq); if (error) break; memcpy(buf, &breq, sizeof(breq)); count++; buf += sizeof(breq); len -= sizeof(breq); } CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { if (len < sizeof(breq)) break; strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, sizeof(breq.ifbr_ifsname)); breq.ifbr_ifsflags = bif->bif_flags; breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; memcpy(buf, &breq, sizeof(breq)); count++; buf += sizeof(breq); len -= sizeof(breq); } bifc->ifbic_len = sizeof(breq) * count; error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_rts(struct bridge_softc *sc, void *arg) { struct ifbaconf *bac = arg; struct bridge_rtnode *brt; struct 
ifbareq bareq; char *buf, *outbuf; int count, buflen, len, error = 0; if (bac->ifbac_len == 0) return (0); count = 0; CK_LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) count++; buflen = sizeof(bareq) * count; outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO); if (outbuf == NULL) return (ENOMEM); count = 0; buf = outbuf; len = min(bac->ifbac_len, buflen); bzero(&bareq, sizeof(bareq)); CK_LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { if (len < sizeof(bareq)) goto out; strlcpy(bareq.ifba_ifsname, brt->brt_ifp->if_xname, sizeof(bareq.ifba_ifsname)); memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); bareq.ifba_vlan = brt->brt_vlan; if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && time_uptime < brt->brt_expire) bareq.ifba_expire = brt->brt_expire - time_uptime; else bareq.ifba_expire = 0; bareq.ifba_flags = brt->brt_flags; memcpy(buf, &bareq, sizeof(bareq)); count++; buf += sizeof(bareq); len -= sizeof(bareq); } out: bac->ifbac_len = sizeof(bareq) * count; error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_saddr(struct bridge_softc *sc, void *arg) { struct ifbareq *req = arg; struct bridge_iflist *bif; struct epoch_tracker et; int error; NET_EPOCH_ENTER(et); bif = bridge_lookup_member(sc, req->ifba_ifsname); if (bif == NULL) { NET_EPOCH_EXIT(et); return (ENOENT); } /* bridge_rtupdate() may acquire the lock. 
*/ error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, req->ifba_flags); NET_EPOCH_EXIT(et); return (error); } static int bridge_ioctl_sto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; sc->sc_brttimeout = param->ifbrp_ctime; return (0); } static int bridge_ioctl_gto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_ctime = sc->sc_brttimeout; return (0); } static int bridge_ioctl_daddr(struct bridge_softc *sc, void *arg) { struct ifbareq *req = arg; return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); } static int bridge_ioctl_flush(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; BRIDGE_RT_LOCK(sc); bridge_rtflush(sc, req->ifbr_ifsflags); BRIDGE_RT_UNLOCK(sc); return (0); } static int bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_prio = bs->bs_bridge_priority; return (0); } static int bridge_ioctl_spri(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio)); } static int bridge_ioctl_ght(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_hellotime = bs->bs_bridge_htime >> 8; return (0); } static int bridge_ioctl_sht(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime)); } static int bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8; return (0); } static int bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay)); } static int bridge_ioctl_gma(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; 
param->ifbrp_maxage = bs->bs_bridge_max_age >> 8; return (0); } static int bridge_ioctl_sma(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage)); } static int bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); } static int bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); } static int bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bif->bif_addrmax = req->ifbr_addrmax; return (0); } static int bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif = NULL; struct ifnet *ifs; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) return (EBUSY); if (ifs->if_bridge != NULL) return (EBUSY); switch (ifs->if_type) { case IFT_ETHER: case IFT_GIF: case IFT_L2VLAN: break; default: return (EINVAL); } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); bif->bif_ifp = ifs; bif->bif_flags = IFBIF_SPAN; CK_LIST_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next); return (0); } static int bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct ifnet *ifs; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) break; if (bif 
== NULL) return (ENOENT); bridge_delete_span(sc, bif); return (0); } static int bridge_ioctl_gbparam(struct bridge_softc *sc, void *arg) { struct ifbropreq *req = arg; struct bstp_state *bs = &sc->sc_stp; struct bstp_port *root_port; req->ifbop_maxage = bs->bs_bridge_max_age >> 8; req->ifbop_hellotime = bs->bs_bridge_htime >> 8; req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; root_port = bs->bs_root_port; if (root_port == NULL) req->ifbop_root_port = 0; else req->ifbop_root_port = root_port->bp_ifp->if_index; req->ifbop_holdcount = bs->bs_txholdcount; req->ifbop_priority = bs->bs_bridge_priority; req->ifbop_protocol = bs->bs_protover; req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; return (0); } static int bridge_ioctl_grte(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_cexceeded = sc->sc_brtexceeded; return (0); } static int bridge_ioctl_gifsstp(struct bridge_softc *sc, void *arg) { struct ifbpstpconf *bifstp = arg; struct bridge_iflist *bif; struct bstp_port *bp; struct ifbpstpreq bpreq; char *buf, *outbuf; int count, buflen, len, error = 0; count = 0; CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if ((bif->bif_flags & IFBIF_STP) != 0) count++; } buflen = sizeof(bpreq) * count; if (bifstp->ifbpstp_len == 0) { bifstp->ifbpstp_len = buflen; return (0); } outbuf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO); if (outbuf == NULL) return (ENOMEM); count = 0; buf = outbuf; len = min(bifstp->ifbpstp_len, buflen); bzero(&bpreq, sizeof(bpreq)); CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (len < sizeof(bpreq)) break; if ((bif->bif_flags & IFBIF_STP) == 0) continue; bp = &bif->bif_stp; bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; 
bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id; bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id; bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; memcpy(buf, &bpreq, sizeof(bpreq)); count++; buf += sizeof(bpreq); len -= sizeof(bpreq); } bifstp->ifbpstp_len = sizeof(bpreq) * count; error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto)); } static int bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc)); } /* * bridge_ifdetach: * * Detach an interface from a bridge. Called when a member * interface is detaching. */ static void bridge_ifdetach(void *arg __unused, struct ifnet *ifp) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif; if (ifp->if_flags & IFF_RENAMING) return; if (V_bridge_cloner == NULL) { /* * This detach handler can be called after * vnet_bridge_uninit(). Just return in that case. */ return; } /* Check if the interface is a bridge member */ if (sc != NULL) { BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif != NULL) bridge_delete_member(sc, bif, 1); BRIDGE_UNLOCK(sc); return; } /* Check if the interface is a span port */ BRIDGE_LIST_LOCK(); LIST_FOREACH(sc, &V_bridge_list, sc_list) { BRIDGE_LOCK(sc); CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifp == bif->bif_ifp) { bridge_delete_span(sc, bif); break; } BRIDGE_UNLOCK(sc); } BRIDGE_LIST_UNLOCK(); } /* * bridge_init: * * Initialize a bridge interface. 
*/ static void bridge_init(void *xsc) { struct bridge_softc *sc = (struct bridge_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; BRIDGE_LOCK(sc); callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */ BRIDGE_UNLOCK(sc); } /* * bridge_stop: * * Stop the bridge interface. */ static void bridge_stop(struct ifnet *ifp, int disable) { struct bridge_softc *sc = ifp->if_softc; BRIDGE_LOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; BRIDGE_RT_LOCK(sc); callout_stop(&sc->sc_brcallout); bstp_stop(&sc->sc_stp); bridge_rtflush(sc, IFBF_FLUSHDYN); BRIDGE_RT_UNLOCK(sc); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * bridge_enqueue: * * Enqueue a packet on a bridge member interface. * */ static int bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) { int len, err = 0; short mflags; struct mbuf *m0; /* We may be sending a fragment so traverse the mbuf */ for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = NULL; len = m->m_pkthdr.len; mflags = m->m_flags; /* * If underlying interface can not do VLAN tag insertion itself * then attach a packet tag that holds it. 
*/ if ((m->m_flags & M_VLANTAG) && (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if (m == NULL) { if_printf(dst_ifp, "unable to prepend VLAN header\n"); if_inc_counter(dst_ifp, IFCOUNTER_OERRORS, 1); continue; } m->m_flags &= ~M_VLANTAG; } M_ASSERTPKTHDR(m); /* We shouldn't transmit mbuf without pkthdr */ if ((err = dst_ifp->if_transmit(dst_ifp, m))) { m_freem(m0); if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); break; } if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len); if (mflags & M_MCAST) if_inc_counter(sc->sc_ifp, IFCOUNTER_OMCASTS, 1); } return (err); } /* * bridge_dummynet: * * Receive a queued packet from dummynet and pass it on to the output * interface. * * The mbuf has the Ethernet header already attached. */ static void bridge_dummynet(struct mbuf *m, struct ifnet *ifp) { struct bridge_softc *sc; sc = ifp->if_bridge; /* * The packet didnt originate from a member interface. This should only * ever happen if a member interface is removed while packets are * queued for it. */ if (sc == NULL) { m_freem(m); return; } if (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif ) { if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) return; if (m == NULL) return; } bridge_enqueue(sc, ifp, m); } /* * bridge_output: * * Send output from a bridge member interface. This * performs the bridging function for locally originated * packets. * * The mbuf has the Ethernet header already attached. We must * enqueue or free the mbuf before returning. 
 */
/*
 * Parameters: ifp is the member interface the packet was headed for and m
 * the packet; sa and rt are not referenced in the body.  Every return path
 * below yields 0.
 */
static int
bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
    struct rtentry *rt)
{
	struct ether_header *eh;
	struct ifnet *bifp, *dst_if;
	struct bridge_softc *sc;
	uint16_t vlan;

	NET_EPOCH_ASSERT();

	/* The Ethernet header must be contiguous before it is inspected. */
	if (m->m_len < ETHER_HDR_LEN) {
		m = m_pullup(m, ETHER_HDR_LEN);
		if (m == NULL)
			return (0);
	}

	eh = mtod(m, struct ether_header *);
	sc = ifp->if_bridge;
	vlan = VLANTAGOF(m);

	bifp = sc->sc_ifp;

	/*
	 * If bridge is down, but the original output interface is up,
	 * go ahead and send out that interface.  Otherwise, the packet
	 * is dropped below.
	 */
	if ((bifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
		dst_if = ifp;
		goto sendunicast;
	}

	/*
	 * If the packet is a multicast, or we don't know a better way to
	 * get there, send to all interfaces.
	 */
	if (ETHER_IS_MULTICAST(eh->ether_dhost))
		dst_if = NULL;
	else
		dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan);
	/* Tap any traffic not passing back out the originating interface */
	if (dst_if != ifp)
		ETHER_BPF_MTAP(bifp, m);
	if (dst_if == NULL) {
		struct bridge_iflist *bif;
		struct mbuf *mc;
		/* Set once the original mbuf itself has been handed out. */
		int used = 0;

		bridge_span(sc, m);

		CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
			dst_if = bif->bif_ifp;

			if (dst_if->if_type == IFT_GIF)
				continue;
			if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
				continue;

			/*
			 * If this is not the original output interface,
			 * and the interface is participating in spanning
			 * tree, make sure the port is in a state that
			 * allows forwarding.
			 */
			if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) &&
			    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
				continue;

			/*
			 * The last member on the list receives the original
			 * mbuf; every other member gets a duplicate.
			 */
			if (CK_LIST_NEXT(bif, bif_next) == NULL) {
				used = 1;
				mc = m;
			} else {
				mc = m_dup(m, M_NOWAIT);
				if (mc == NULL) {
					if_inc_counter(bifp, IFCOUNTER_OERRORS, 1);
					continue;
				}
			}

			bridge_enqueue(sc, dst_if, mc);
		}
		/* Nobody took the original (last member skipped): free it. */
		if (used == 0)
			m_freem(m);
		return (0);
	}

sendunicast:
	/*
	 * XXX Spanning tree consideration here?
	 */

	bridge_span(sc, m);
	if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) {
		m_freem(m);
		return (0);
	}

	bridge_enqueue(sc, dst_if, m);
	return (0);
}

/*
 * bridge_transmit:
 *
 *	Do output on a bridge.
 *
 *	The packet is tapped to bpf on the bridge interface.  A
 *	non-broadcast, non-multicast frame whose destination is found in
 *	the forwarding table (looked up with vlan 1) is enqueued on that
 *	member; anything else is handed to bridge_broadcast() with
 *	filtering disabled.  Returns the bridge_enqueue() result for the
 *	unicast case and 0 otherwise.
 */
static int
bridge_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct bridge_softc *sc;
	struct ether_header *eh;
	struct ifnet *dst_if;
	int error = 0;

	sc = ifp->if_softc;

	ETHER_BPF_MTAP(ifp, m);

	eh = mtod(m, struct ether_header *);

	if (((m->m_flags & (M_BCAST|M_MCAST)) == 0) &&
	    (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1)) != NULL) {
		error = bridge_enqueue(sc, dst_if, m);
	} else
		bridge_broadcast(sc, ifp, m, 0);

	return (error);
}

#ifdef ALTQ
/*
 * Drain the interface send queue, passing each dequeued packet to
 * bridge_transmit().  The queue lock is held across the whole drain.
 */
static void
bridge_altq_start(if_t ifp)
{
	struct ifaltq *ifq = &ifp->if_snd;
	struct mbuf *m;

	IFQ_LOCK(ifq);
	IFQ_DEQUEUE_NOLOCK(ifq, m);
	while (m != NULL) {
		bridge_transmit(ifp, m);
		IFQ_DEQUEUE_NOLOCK(ifq, m);
	}
	IFQ_UNLOCK(ifq);
}

/*
 * Transmit entry used when ALTQ is compiled in: with ALTQ enabled on the
 * send queue the packet is enqueued and the queue drained; otherwise it is
 * passed straight to bridge_transmit().  Returns the enqueue or transmit
 * error.
 */
static int
bridge_altq_transmit(if_t ifp, struct mbuf *m)
{
	int err;

	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
		IFQ_ENQUEUE(&ifp->if_snd, m, err);
		if (err == 0)
			bridge_altq_start(ifp);
	} else
		err = bridge_transmit(ifp, m);

	return (err);
}
#endif	/* ALTQ */

/*
 * The ifp->if_qflush entry point for if_bridge(4) is no-op.
 */
static void
bridge_qflush(struct ifnet *ifp __unused)
{
}

/*
 * bridge_forward:
 *
 *	The forwarding function of the bridge.
 *
 *	sbif is the member the frame arrived on and m the frame; the
 *	receiving interface is read from m->m_pkthdr.rcvif.
 *
 *	NOTE: Releases the lock on return.
 *	NOTE(review): no unlock is visible in this function, which only
 *	asserts the net epoch; the note above may be stale -- confirm.
 */
static void
bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
    struct mbuf *m)
{
	struct bridge_iflist *dbif;
	struct ifnet *src_if, *dst_if, *ifp;
	struct ether_header *eh;
	uint16_t vlan;
	uint8_t *dst;
	int error;

	NET_EPOCH_ASSERT();

	src_if = m->m_pkthdr.rcvif;
	ifp = sc->sc_ifp;

	/* Input statistics are charged to the bridge interface itself. */
	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
	vlan = VLANTAGOF(m);

	/* A source port that STP has put in the discarding state drops here. */
	if ((sbif->bif_flags & IFBIF_STP) &&
	    sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
		goto drop;

	eh = mtod(m, struct ether_header *);
	dst = eh->ether_dhost;

	/* If the interface is learning, record the address.
 */
	if (sbif->bif_flags & IFBIF_LEARNING) {
		error = bridge_rtupdate(sc, eh->ether_shost, vlan,
		    sbif, 0, IFBAF_DYNAMIC);
		/*
		 * If the interface has address limits then deny any source
		 * that is not in the cache.
		 */
		if (error && sbif->bif_addrmax)
			goto drop;
	}

	/* A port still in the STP learning state learns but does not forward. */
	if ((sbif->bif_flags & IFBIF_STP) != 0 &&
	    sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING)
		goto drop;

	/*
	 * At this point, the port either doesn't participate
	 * in spanning tree or it is in the forwarding state.
	 */

	/*
	 * If the packet is unicast, destined for someone on
	 * "this" side of the bridge, drop it.
	 */
	if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
		dst_if = bridge_rtlookup(sc, dst, vlan);
		if (src_if == dst_if)
			goto drop;
	} else {
		/*
		 * Check if it's a reserved multicast address, any address
		 * listed in 802.1D section 7.12.6 may not be forwarded by the
		 * bridge.
		 * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F
		 */
		if (dst[0] == 0x01 && dst[1] == 0x80 &&
		    dst[2] == 0xc2 && dst[3] == 0x00 &&
		    dst[4] == 0x00 && dst[5] <= 0x0f)
			goto drop;

		/* ...forward it to all interfaces. */
		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
		dst_if = NULL;
	}

	/*
	 * If we have a destination interface which is a member of our bridge,
	 * OR this is a unicast packet, push it through the bpf(4) machinery.
	 * For broadcast or multicast packets, don't bother because it will
	 * be reinjected into ether_input. We do this before we pass the packets
	 * through the pfil(9) framework, as it is possible that pfil(9) will
	 * drop the packet, or possibly modify it, making it difficult to debug
	 * firewall issues on the bridge.
	 */
	if (dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0)
		ETHER_BPF_MTAP(ifp, m);

	/* run the packet filter */
	if (PFIL_HOOKED_IN(V_inet_pfil_head)
#ifdef INET6
	    || PFIL_HOOKED_IN(V_inet6_pfil_head)
#endif
	    ) {
		/*
		 * On a non-zero return or a NULL mbuf there is nothing left
		 * to forward, and nothing is freed here.
		 */
		if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0)
			return;
		if (m == NULL)
			return;
	}

	/* Unknown unicast destination or multicast: flood to the members. */
	if (dst_if == NULL) {
		bridge_broadcast(sc, src_if, m, 1);
		return;
	}

	/*
	 * At this point, we're dealing with a unicast frame
	 * going to a different interface.
	 */
	if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
		goto drop;

	dbif = bridge_lookup_member_if(sc, dst_if);
	if (dbif == NULL)
		/* Not a member of the bridge (anymore?) */
		goto drop;

	/* Private segments can not talk to each other */
	if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)
		goto drop;

	if ((dbif->bif_flags & IFBIF_STP) &&
	    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
		goto drop;

	if (PFIL_HOOKED_OUT(V_inet_pfil_head)
#ifdef INET6
	    || PFIL_HOOKED_OUT(V_inet6_pfil_head)
#endif
	    ) {
		if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0)
			return;
		if (m == NULL)
			return;
	}

	bridge_enqueue(sc, dst_if, m);
	return;

drop:
	m_freem(m);
}

/*
 * bridge_input:
 *
 *	Receive input from a member interface.  Queue the packet for
 *	bridging if it is not for us.
 *
 *	Returns the mbuf when the caller should continue processing it
 *	locally, or NULL when it has been consumed here.
 */
static struct mbuf *
bridge_input(struct ifnet *ifp, struct mbuf *m)
{
	struct bridge_softc *sc = ifp->if_bridge;
	struct bridge_iflist *bif, *bif2;
	struct ifnet *bifp;
	struct ether_header *eh;
	struct mbuf *mc, *mc2;
	uint16_t vlan;
	int error;

	NET_EPOCH_ASSERT();

	/* A bridge that is not running passes the packet through untouched. */
	if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		return (m);

	bifp = sc->sc_ifp;
	vlan = VLANTAGOF(m);

	/*
	 * Implement support for bridge monitoring. If this flag has been
	 * set on this interface, discard the packet once we push it through
	 * the bpf(4) machinery, but before we do, increment the byte and
	 * packet counters associated with this interface.
*/ if ((bifp->if_flags & IFF_MONITOR) != 0) { m->m_pkthdr.rcvif = bifp; ETHER_BPF_MTAP(bifp, m); if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return (NULL); } bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { return (m); } eh = mtod(m, struct ether_header *); bridge_span(sc, m); if (m->m_flags & (M_BCAST|M_MCAST)) { /* Tap off 802.1D packets; they do not get forwarded. */ if (memcmp(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN) == 0) { bstp_input(&bif->bif_stp, ifp, m); /* consumes mbuf */ return (NULL); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { return (m); } /* * Make a deep copy of the packet and enqueue the copy * for bridge processing; return the original packet for * local processing. */ mc = m_dup(m, M_NOWAIT); if (mc == NULL) { return (m); } /* Perform the bridge forwarding function with the copy. */ bridge_forward(sc, bif, mc); /* * Reinject the mbuf as arriving on the bridge so we have a * chance at claiming multicast packets. We can not loop back * here from ether_input as a bridge is never a member of a * bridge. */ KASSERT(bifp->if_bridge == NULL, ("loop created in bridge_input")); mc2 = m_dup(m, M_NOWAIT); if (mc2 != NULL) { /* Keep the layer3 header aligned */ int i = min(mc2->m_pkthdr.len, max_protohdr); mc2 = m_copyup(mc2, i, ETHER_ALIGN); } if (mc2 != NULL) { mc2->m_pkthdr.rcvif = bifp; (*bifp->if_input)(bifp, mc2); } /* Return the original packet for local processing. 
*/ return (m); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { return (m); } #if (defined(INET) || defined(INET6)) # define OR_CARP_CHECK_WE_ARE_DST(iface) \ || ((iface)->if_carp \ && (*carp_forus_p)((iface), eh->ether_dhost)) # define OR_CARP_CHECK_WE_ARE_SRC(iface) \ || ((iface)->if_carp \ && (*carp_forus_p)((iface), eh->ether_shost)) #else # define OR_CARP_CHECK_WE_ARE_DST(iface) # define OR_CARP_CHECK_WE_ARE_SRC(iface) #endif #ifdef INET6 # define OR_PFIL_HOOKED_INET6 \ || PFIL_HOOKED_IN(V_inet6_pfil_head) #else # define OR_PFIL_HOOKED_INET6 #endif #define GRAB_OUR_PACKETS(iface) \ if ((iface)->if_type == IFT_GIF) \ continue; \ /* It is destined for us. */ \ if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_DST((iface)) \ ) { \ if (bif->bif_flags & IFBIF_LEARNING) { \ error = bridge_rtupdate(sc, eh->ether_shost, \ vlan, bif, 0, IFBAF_DYNAMIC); \ if (error && bif->bif_addrmax) { \ m_freem(m); \ return (NULL); \ } \ } \ m->m_pkthdr.rcvif = iface; \ if ((iface) == ifp) { \ /* Skip bridge processing... src == dest */ \ return (m); \ } \ /* It's passing over or to the bridge, locally. */ \ ETHER_BPF_MTAP(bifp, m); \ if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1); \ if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); \ /* Filter on the physical interface. */ \ if (V_pfil_local_phys && (PFIL_HOOKED_IN(V_inet_pfil_head) \ OR_PFIL_HOOKED_INET6)) { \ if (bridge_pfil(&m, NULL, ifp, \ PFIL_IN) != 0 || m == NULL) { \ return (NULL); \ } \ } \ if ((iface) != bifp) \ ETHER_BPF_MTAP(iface, m); \ return (m); \ } \ \ /* We just received a packet that we sent out. */ \ if (memcmp(IF_LLADDR((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_SRC((iface)) \ ) { \ m_freem(m); \ return (NULL); \ } /* * Unicast. Make sure it's not for the bridge. */ do { GRAB_OUR_PACKETS(bifp) } while (0); /* * Give a chance for ifp at first priority. 
This will help when the * packet comes through the interface like VLAN's with the same MACs * on several interfaces from the same bridge. This also will save * some CPU cycles in case the destination interface and the input * interface (eq ifp) are the same. */ do { GRAB_OUR_PACKETS(ifp) } while (0); /* Now check the all bridge members. */ CK_LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) { GRAB_OUR_PACKETS(bif2->bif_ifp) } #undef OR_CARP_CHECK_WE_ARE_DST #undef OR_CARP_CHECK_WE_ARE_SRC #undef OR_PFIL_HOOKED_INET6 #undef GRAB_OUR_PACKETS /* Perform the bridge forwarding function. */ bridge_forward(sc, bif, m); return (NULL); } /* * bridge_broadcast: * * Send a frame to all interfaces that are members of * the bridge, except for the one on which the packet * arrived. * * NOTE: Releases the lock on return. */ static void bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, struct mbuf *m, int runfilt) { struct bridge_iflist *dbif, *sbif; struct mbuf *mc; struct ifnet *dst_if; int used = 0, i; NET_EPOCH_ASSERT(); sbif = bridge_lookup_member_if(sc, src_if); /* Filter on the bridge interface before broadcasting */ if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif )) { if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0) return; if (m == NULL) return; } CK_LIST_FOREACH(dbif, &sc->sc_iflist, bif_next) { dst_if = dbif->bif_ifp; if (dst_if == src_if) continue; /* Private segments can not talk to each other */ if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)) continue; if ((dbif->bif_flags & IFBIF_STP) && dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) continue; if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 && (m->m_flags & (M_BCAST|M_MCAST)) == 0) continue; if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; if (CK_LIST_NEXT(dbif, bif_next) == NULL) { mc = m; used = 1; } else { mc = m_dup(m, M_NOWAIT); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } 
/* * Filter on the output interface. Pass a NULL bridge interface * pointer so we do not redundantly filter on the bridge for * each interface we broadcast on. */ if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif )) { if (used == 0) { /* Keep the layer3 header aligned */ i = min(mc->m_pkthdr.len, max_protohdr); mc = m_copyup(mc, i, ETHER_ALIGN); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0) continue; if (mc == NULL) continue; } bridge_enqueue(sc, dst_if, mc); } if (used == 0) m_freem(m); } /* * bridge_span: * * Duplicate a packet out one or more interfaces that are in span mode, * the original mbuf is unmodified. */ static void bridge_span(struct bridge_softc *sc, struct mbuf *m) { struct bridge_iflist *bif; struct ifnet *dst_if; struct mbuf *mc; NET_EPOCH_ASSERT(); if (CK_LIST_EMPTY(&sc->sc_spanlist)) return; CK_LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { dst_if = bif->bif_ifp; if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; mc = m_dup(m, M_NOWAIT); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } bridge_enqueue(sc, dst_if, mc); } } /* * bridge_rtupdate: * * Add a bridge routing entry. */ static int bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, struct bridge_iflist *bif, int setflags, uint8_t flags) { struct bridge_rtnode *brt; int error; BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc); /* Check the source address is valid and not multicast. */ if (ETHER_IS_MULTICAST(dst) || (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 && dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0) return (EINVAL); /* 802.1p frames map to vlan 1 */ if (vlan == 0) vlan = 1; /* * A route for this destination might already exist. If so, * update it, otherwise create a new one. 
*/ if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) { BRIDGE_RT_LOCK(sc); /* Check again, now that we have the lock. There could have * been a race and we only want to insert this once. */ if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) != NULL) { BRIDGE_RT_UNLOCK(sc); return (0); } if (sc->sc_brtcnt >= sc->sc_brtmax) { sc->sc_brtexceeded++; BRIDGE_RT_UNLOCK(sc); return (ENOSPC); } /* Check per interface address limits (if enabled) */ if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) { bif->bif_addrexceeded++; BRIDGE_RT_UNLOCK(sc); return (ENOSPC); } /* * Allocate a new bridge forwarding node, and * initialize the expiration time and Ethernet * address. */ brt = uma_zalloc(V_bridge_rtnode_zone, M_NOWAIT | M_ZERO); if (brt == NULL) { BRIDGE_RT_UNLOCK(sc); return (ENOMEM); } brt->brt_vnet = curvnet; if (bif->bif_flags & IFBIF_STICKY) brt->brt_flags = IFBAF_STICKY; else brt->brt_flags = IFBAF_DYNAMIC; memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN); brt->brt_vlan = vlan; if ((error = bridge_rtnode_insert(sc, brt)) != 0) { uma_zfree(V_bridge_rtnode_zone, brt); BRIDGE_RT_UNLOCK(sc); return (error); } brt->brt_dst = bif; bif->bif_addrcnt++; BRIDGE_RT_UNLOCK(sc); } if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && brt->brt_dst != bif) { BRIDGE_RT_LOCK(sc); brt->brt_dst->bif_addrcnt--; brt->brt_dst = bif; brt->brt_dst->bif_addrcnt++; BRIDGE_RT_UNLOCK(sc); } if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) brt->brt_expire = time_uptime + sc->sc_brttimeout; if (setflags) brt->brt_flags = flags; return (0); } /* * bridge_rtlookup: * * Lookup the destination interface for an address. 
*/ static struct ifnet * bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; NET_EPOCH_ASSERT(); if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL) return (NULL); return (brt->brt_ifp); } /* * bridge_rttrim: * * Trim the routine table so that we have a number * of routing entries less than or equal to the * maximum number. */ static void bridge_rttrim(struct bridge_softc *sc) { struct bridge_rtnode *brt, *nbrt; NET_EPOCH_ASSERT(); BRIDGE_RT_LOCK_ASSERT(sc); /* Make sure we actually need to do this. */ if (sc->sc_brtcnt <= sc->sc_brtmax) return; /* Force an aging cycle; this might trim enough addresses. */ bridge_rtage(sc); if (sc->sc_brtcnt <= sc->sc_brtmax) return; CK_LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { bridge_rtnode_destroy(sc, brt); if (sc->sc_brtcnt <= sc->sc_brtmax) return; } } } /* * bridge_timer: * * Aging timer for the bridge. */ static void bridge_timer(void *arg) { struct bridge_softc *sc = arg; BRIDGE_RT_LOCK_ASSERT(sc); /* Destruction of rtnodes requires a proper vnet context */ CURVNET_SET(sc->sc_ifp->if_vnet); bridge_rtage(sc); if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); CURVNET_RESTORE(); } /* * bridge_rtage: * * Perform an aging cycle. */ static void bridge_rtage(struct bridge_softc *sc) { struct bridge_rtnode *brt, *nbrt; BRIDGE_RT_LOCK_ASSERT(sc); CK_LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { if (time_uptime >= brt->brt_expire) bridge_rtnode_destroy(sc, brt); } } } /* * bridge_rtflush: * * Remove all dynamic addresses from the bridge. 
*/ static void bridge_rtflush(struct bridge_softc *sc, int full) { struct bridge_rtnode *brt, *nbrt; BRIDGE_RT_LOCK_ASSERT(sc); CK_LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) bridge_rtnode_destroy(sc, brt); } } /* * bridge_rtdaddr: * * Remove an address from the table. */ static int bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; int found = 0; BRIDGE_RT_LOCK(sc); /* * If vlan is zero then we want to delete for all vlans so the lookup * may return more than one. */ while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) { bridge_rtnode_destroy(sc, brt); found = 1; } BRIDGE_RT_UNLOCK(sc); return (found ? 0 : ENOENT); } /* * bridge_rtdelete: * * Delete routes to a speicifc member interface. */ static void bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full) { struct bridge_rtnode *brt, *nbrt; BRIDGE_RT_LOCK_ASSERT(sc); CK_LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (brt->brt_ifp == ifp && (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)) bridge_rtnode_destroy(sc, brt); } } /* * bridge_rtable_init: * * Initialize the route table for this bridge. */ static void bridge_rtable_init(struct bridge_softc *sc) { int i; sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, M_DEVBUF, M_WAITOK); for (i = 0; i < BRIDGE_RTHASH_SIZE; i++) CK_LIST_INIT(&sc->sc_rthash[i]); sc->sc_rthash_key = arc4random(); CK_LIST_INIT(&sc->sc_rtlist); } /* * bridge_rtable_fini: * * Deconstruct the route table for this bridge. */ static void bridge_rtable_fini(struct bridge_softc *sc) { KASSERT(sc->sc_brtcnt == 0, ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt)); free(sc->sc_rthash, M_DEVBUF); } /* * The following hash function is adapted from "Hash Functions" by Bob Jenkins * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 
*/ #define mix(a, b, c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (/*CONSTCOND*/0) static __inline uint32_t bridge_rthash(struct bridge_softc *sc, const uint8_t *addr) { uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key; b += addr[5] << 8; b += addr[4]; a += addr[3] << 24; a += addr[2] << 16; a += addr[1] << 8; a += addr[0]; mix(a, b, c); return (c & BRIDGE_RTHASH_MASK); } #undef mix static int bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b) { int i, d; for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { d = ((int)a[i]) - ((int)b[i]); } return (d); } /* * bridge_rtnode_lookup: * * Look up a bridge route node for the specified destination. Compare the * vlan id or if zero then just return the first match. */ static struct bridge_rtnode * bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; uint32_t hash; int dir; BRIDGE_RT_LOCK_OR_NET_EPOCH_ASSERT(sc); hash = bridge_rthash(sc, addr); CK_LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) { dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr); if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0)) return (brt); if (dir > 0) return (NULL); } return (NULL); } /* * bridge_rtnode_insert: * * Insert the specified bridge node into the route table. We * assume the entry is not already in the table. 
 */
/*
 * Returns 0 on success, or EEXIST when an entry with the same address and
 * vlan is already present in the bucket.  The route-table lock must be held.
 */
static int
bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt)
{
	struct bridge_rtnode *lbrt;
	uint32_t hash;
	int dir;

	BRIDGE_RT_LOCK_ASSERT(sc);

	hash = bridge_rthash(sc, brt->brt_addr);

	/* Empty bucket: the new node simply becomes its head. */
	lbrt = CK_LIST_FIRST(&sc->sc_rthash[hash]);
	if (lbrt == NULL) {
		CK_LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash);
		goto out;
	}

	/*
	 * Walk the bucket and place the node in front of the first entry
	 * whose address compares lower, or after the last entry when no
	 * such entry exists.
	 */
	do {
		dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr);
		if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan)
			return (EEXIST);
		if (dir > 0) {
			CK_LIST_INSERT_BEFORE(lbrt, brt, brt_hash);
			goto out;
		}
		if (CK_LIST_NEXT(lbrt, brt_hash) == NULL) {
			CK_LIST_INSERT_AFTER(lbrt, brt, brt_hash);
			goto out;
		}
		lbrt = CK_LIST_NEXT(lbrt, brt_hash);
	} while (lbrt != NULL);

#ifdef DIAGNOSTIC
	panic("bridge_rtnode_insert: impossible");
#endif

out:
	/* Also link the node on the bridge-wide list and count it. */
	CK_LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list);
	sc->sc_brtcnt++;

	return (0);
}

/*
 * Deferred-free callback scheduled by bridge_rtnode_destroy(): recover the
 * node from its embedded epoch context and return it to the zone, inside
 * the vnet recorded in the node.
 */
static void
bridge_rtnode_destroy_cb(struct epoch_context *ctx)
{
	struct bridge_rtnode *brt;

	brt = __containerof(ctx, struct bridge_rtnode, brt_epoch_ctx);

	CURVNET_SET(brt->brt_vnet);
	uma_zfree(V_bridge_rtnode_zone, brt);
	CURVNET_RESTORE();
}

/*
 * bridge_rtnode_destroy:
 *
 *	Destroy a bridge rtnode.
 *
 *	The node is unlinked from both lists and the counters are
 *	adjusted right away; the memory itself is released later through
 *	NET_EPOCH_CALL().  The route-table lock must be held.
 */
static void
bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt)
{
	BRIDGE_RT_LOCK_ASSERT(sc);

	CK_LIST_REMOVE(brt, brt_hash);

	CK_LIST_REMOVE(brt, brt_list);
	sc->sc_brtcnt--;
	brt->brt_dst->bif_addrcnt--;

	NET_EPOCH_CALL(bridge_rtnode_destroy_cb, &brt->brt_epoch_ctx);
}

/*
 * bridge_rtable_expire:
 *
 *	Set the expiry time for all routes on an interface.
*/ static void bridge_rtable_expire(struct ifnet *ifp, int age) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_rtnode *brt; CURVNET_SET(ifp->if_vnet); BRIDGE_RT_LOCK(sc); /* * If the age is zero then flush, otherwise set all the expiry times to * age for the interface */ if (age == 0) bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN); else { CK_LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { /* Cap the expiry time to 'age' */ if (brt->brt_ifp == ifp && brt->brt_expire > time_uptime + age && (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) brt->brt_expire = time_uptime + age; } } BRIDGE_RT_UNLOCK(sc); CURVNET_RESTORE(); } /* * bridge_state_change: * * Callback from the bridgestp code when a port changes states. */ static void bridge_state_change(struct ifnet *ifp, int state) { struct bridge_softc *sc = ifp->if_bridge; static const char *stpstates[] = { "disabled", "listening", "learning", "forwarding", "blocking", "discarding" }; CURVNET_SET(ifp->if_vnet); if (V_log_stp) log(LOG_NOTICE, "%s: state changed to %s on %s\n", sc->sc_ifp->if_xname, stpstates[state], ifp->if_xname); CURVNET_RESTORE(); } /* * Send bridge packets through pfil if they are one of the types pfil can deal * with, or if they are ARP or REVARP. (pfil will pass ARP and REVARP without * question.) If *bifp or *ifp are NULL then packet filtering is skipped for * that interface. 
 */
/*
 * Parameters: mp points at the packet (updated in place and set to NULL
 * when the packet is freed here), bifp is the bridge interface, ifp the
 * member interface and dir is PFIL_IN or PFIL_OUT.
 *
 * Returns 0 when the packet passed, was consumed by a hook, or filtering
 * is disabled; EACCES when a hook dropped it; -1 (or the bridge_fragment()
 * result) on other failures.  Callers also check *mp for NULL after a 0
 * return.
 */
static int
bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
{
	int snap, error, i, hlen;
	struct ether_header *eh1, eh2;
	struct ip *ip;
	struct llc llc1;
	u_int16_t ether_type;
	pfil_return_t rv;

	snap = 0;
	error = -1;	/* Default error if not error == 0 */

#if 0
	/* we may return with the IP fields swapped, ensure its not shared */
	KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__));
#endif

	if (V_pfil_bridge == 0 && V_pfil_member == 0 && V_pfil_ipfw == 0)
		return (0); /* filtering is disabled */

	/* Make the leading headers contiguous before they are examined. */
	i = min((*mp)->m_pkthdr.len, max_protohdr);
	if ((*mp)->m_len < i) {
		*mp = m_pullup(*mp, i);
		if (*mp == NULL) {
			printf("%s: m_pullup failed\n", __func__);
			return (-1);
		}
	}

	eh1 = mtod(*mp, struct ether_header *);
	ether_type = ntohs(eh1->ether_type);

	/*
	 * Check for SNAP/LLC.
	 */
	if (ether_type < ETHERMTU) {
		struct llc *llc2 = (struct llc *)(eh1 + 1);

		if ((*mp)->m_len >= ETHER_HDR_LEN + 8 &&
		    llc2->llc_dsap == LLC_SNAP_LSAP &&
		    llc2->llc_ssap == LLC_SNAP_LSAP &&
		    llc2->llc_control == LLC_UI) {
			/*
			 * NOTE(review): this converts a wire-order field to
			 * host order, so ntohs() would be the conventional
			 * spelling (as used for eh1 above) -- confirm.
			 */
			ether_type = htons(llc2->llc_un.type_snap.ether_type);
			snap = 1;
		}
	}

	/*
	 * If we're trying to filter bridge traffic, don't look at anything
	 * other than IP and ARP traffic.  If the filter doesn't understand
	 * IPv6, don't allow IPv6 through the bridge either.  This is lame
	 * since if we really wanted, say, an AppleTalk filter, we are hosed,
	 * but of course we don't have an AppleTalk filter to begin with.
	 * (Note that since pfil doesn't understand ARP it will pass *ALL*
	 * ARP traffic.)
	 */
	switch (ether_type) {
		case ETHERTYPE_ARP:
		case ETHERTYPE_REVARP:
			if (V_pfil_ipfw_arp == 0)
				return (0); /* Automatically pass */
			break;

		case ETHERTYPE_IP:
#ifdef INET6
		case ETHERTYPE_IPV6:
#endif /* INET6 */
			break;
		default:
			/*
			 * Check to see if the user wants to pass non-ip
			 * packets, these will not be checked by pfil(9) and
			 * passed unconditionally so the default is to drop.
			 */
			if (V_pfil_onlyip)
				goto bad;
	}

	/* Run the packet through pfil before stripping link headers */
	if (PFIL_HOOKED_OUT(V_link_pfil_head) && V_pfil_ipfw != 0 &&
	    dir == PFIL_OUT && ifp != NULL) {
		switch (pfil_run_hooks(V_link_pfil_head, mp, ifp, dir, NULL)) {
		case PFIL_DROPPED:
			return (EACCES);
		case PFIL_CONSUMED:
			return (0);
		}
	}

	/* Strip off the Ethernet header and keep a copy. */
	m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2);
	m_adj(*mp, ETHER_HDR_LEN);

	/* Strip off snap header, if present */
	if (snap) {
		m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1);
		m_adj(*mp, sizeof(struct llc));
	}

	/*
	 * Check the IP header for alignment and errors
	 */
	if (dir == PFIL_IN) {
		switch (ether_type) {
			case ETHERTYPE_IP:
				error = bridge_ip_checkbasic(mp);
				break;
#ifdef INET6
			case ETHERTYPE_IPV6:
				error = bridge_ip6_checkbasic(mp);
				break;
#endif /* INET6 */
			default:
				error = 0;
		}
		if (error)
			goto bad;
	}

	/*
	 * From here on a jump to `bad' frees the packet but returns 0;
	 * callers detect that case through *mp being NULL.
	 */
	error = 0;

	/*
	 * Run the packet through pfil
	 */
	rv = PFIL_PASS;
	switch (ether_type) {
	case ETHERTYPE_IP:
		/*
		 * Run pfil on the member interface and the bridge, both can
		 * be skipped by clearing pfil_member or pfil_bridge.
		 *
		 * Keep the order:
		 *   in_if -> bridge_if -> out_if
		 */
		if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv =
		    pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) !=
		    PFIL_PASS)
			break;

		if (V_pfil_member && ifp != NULL && (rv =
		    pfil_run_hooks(V_inet_pfil_head, mp, ifp, dir, NULL)) !=
		    PFIL_PASS)
			break;

		if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv =
		    pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) !=
		    PFIL_PASS)
			break;

		/* check if we need to fragment the packet */
		/* bridge_fragment generates a mbuf chain of packets */
		/* that already include eth headers */
		if (V_pfil_member && ifp != NULL && dir == PFIL_OUT) {
			i = (*mp)->m_pkthdr.len;
			if (i > ifp->if_mtu) {
				error = bridge_fragment(ifp, mp, &eh2, snap,
				    &llc1);
				return (error);
			}
		}

		/* Recalculate the ip checksum. */
		ip = mtod(*mp, struct ip *);
		hlen = ip->ip_hl << 2;
		if (hlen < sizeof(struct ip))
			goto bad;
		if (hlen > (*mp)->m_len) {
			if ((*mp = m_pullup(*mp, hlen)) == NULL)
				goto bad;
			ip = mtod(*mp, struct ip *);
			if (ip == NULL)
				goto bad;
		}
		ip->ip_sum = 0;
		if (hlen == sizeof(struct ip))
			ip->ip_sum = in_cksum_hdr(ip);
		else
			ip->ip_sum = in_cksum(*mp, hlen);

		break;
#ifdef INET6
	case ETHERTYPE_IPV6:
		/* Same bridge/member/bridge ordering as the IPv4 case. */
		if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv =
		    pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) !=
		    PFIL_PASS)
			break;

		if (V_pfil_member && ifp != NULL && (rv =
		    pfil_run_hooks(V_inet6_pfil_head, mp, ifp, dir, NULL)) !=
		    PFIL_PASS)
			break;

		if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv =
		    pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) !=
		    PFIL_PASS)
			break;
		break;
#endif
	}

	/* Translate the verdict of the last hook that ran. */
	switch (rv) {
	case PFIL_CONSUMED:
		return (0);
	case PFIL_DROPPED:
		return (EACCES);
	default:
		break;
	}

	error = -1;

	/*
	 * Finally, put everything back the way it was and return
	 */
	if (snap) {
		M_PREPEND(*mp, sizeof(struct llc), M_NOWAIT);
		if (*mp == NULL)
			return (error);
		bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc));
	}

	M_PREPEND(*mp, ETHER_HDR_LEN, M_NOWAIT);
	if (*mp == NULL)
		return (error);
	bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN);

	return (0);

bad:
	m_freem(*mp);
	*mp = NULL;
	return (error);
}

/*
 * Perform basic checks on header size since
 * pfil assumes ip_input has already processed
 * it for it.  Cut-and-pasted from ip_input.c.
 * Given how simple the IPv6 version is,
 * does the IPv4 version really need to be
 * this complicated?
 *
 * XXX Should we update ipstat here, or not?
 * XXX Right now we update ipstat but not
 * XXX csum_counter.
*/ static int bridge_ip_checkbasic(struct mbuf **mp) { struct mbuf *m = *mp; struct ip *ip; int len, hlen; u_short sum; if (*mp == NULL) return (-1); if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { if ((m = m_copyup(m, sizeof(struct ip), (max_linkhdr + 3) & ~3)) == NULL) { /* XXXJRT new stat, please */ KMOD_IPSTAT_INC(ips_toosmall); goto bad; } } else if (__predict_false(m->m_len < sizeof (struct ip))) { if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { KMOD_IPSTAT_INC(ips_toosmall); goto bad; } } ip = mtod(m, struct ip *); if (ip == NULL) goto bad; if (ip->ip_v != IPVERSION) { KMOD_IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ KMOD_IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { KMOD_IPSTAT_INC(ips_badhlen); goto bad; } ip = mtod(m, struct ip *); if (ip == NULL) goto bad; } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { KMOD_IPSTAT_INC(ips_badsum); goto bad; } /* Retrieve the packet length. */ len = ntohs(ip->ip_len); /* * Check for additional length bogosity */ if (len < hlen) { KMOD_IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < len) { KMOD_IPSTAT_INC(ips_tooshort); goto bad; } /* Checks out, proceed */ *mp = m; return (0); bad: *mp = m; return (-1); } #ifdef INET6 /* * Same as above, but for IPv6. * Cut-and-pasted from ip6_input.c. * XXX Should we update ip6stat, or not? */ static int bridge_ip6_checkbasic(struct mbuf **mp) { struct mbuf *m = *mp; struct ip6_hdr *ip6; /* * If the IPv6 header is not aligned, slurp it up into a new * mbuf with space for link headers, in the event we forward * it. 
Otherwise, if it is aligned, make sure the entire base
	 * IPv6 header is in the first mbuf of the chain.
	 */
	if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
		/* Saved first: m is overwritten by the m_copyup() result. */
		struct ifnet *inifp = m->m_pkthdr.rcvif;
		if ((m = m_copyup(m, sizeof(struct ip6_hdr),
			    (max_linkhdr + 3) & ~3)) == NULL) {
			/* XXXJRT new stat, please */
			IP6STAT_INC(ip6s_toosmall);
			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
			goto bad;
		}
	} else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) {
		/* Saved first: m is overwritten by the m_pullup() result. */
		struct ifnet *inifp = m->m_pkthdr.rcvif;
		if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
			IP6STAT_INC(ip6s_toosmall);
			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
			goto bad;
		}
	}

	ip6 = mtod(m, struct ip6_hdr *);

	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
		IP6STAT_INC(ip6s_badvers);
		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
		goto bad;
	}

	/* Checks out, proceed */
	*mp = m;
	return (0);

bad:
	/* m is NULL here when m_copyup()/m_pullup() failed. */
	*mp = m;
	return (-1);
}
#endif /* INET6 */

/*
 * bridge_fragment:
 *
 *	Fragment mbuf chain in multiple packets and prepend ethernet header.
 *
 *	Returns 0 on success with *mp pointing at the first packet of the
 *	resulting m_nextpkt list.  On failure returns the ip_fragment()
 *	error, ENOBUFS when a prepend fails, or -1 when the initial
 *	m_pullup() fails, after freeing the list reachable from *mp.
 */
static int
bridge_fragment(struct ifnet *ifp, struct mbuf **mp, struct ether_header *eh,
    int snap, struct llc *llc)
{
	struct mbuf *m = *mp, *nextpkt = NULL, *mprev = NULL, *mcur = NULL;
	struct ip *ip;
	int error = -1;

	if (m->m_len < sizeof(struct ip) &&
	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
		goto dropit;
	ip = mtod(m, struct ip *);

	m->m_pkthdr.csum_flags |= CSUM_IP;
	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist);
	if (error)
		goto dropit;

	/*
	 * Walk the chain and re-add the Ethernet header for
	 * each mbuf packet.
	 */
	for (mcur = m; mcur; mcur = mcur->m_nextpkt) {
		/* Detach the packet while headers are prepended to it. */
		nextpkt = mcur->m_nextpkt;
		mcur->m_nextpkt = NULL;
		if (snap) {
			M_PREPEND(mcur, sizeof(struct llc), M_NOWAIT);
			if (mcur == NULL) {
				error = ENOBUFS;
				/* Re-link the remainder so dropit can reach it. */
				if (mprev != NULL)
					mprev->m_nextpkt = nextpkt;
				goto dropit;
			}
			bcopy(llc, mtod(mcur, caddr_t),sizeof(struct llc));
		}

		M_PREPEND(mcur, ETHER_HDR_LEN, M_NOWAIT);
		if (mcur == NULL) {
			error = ENOBUFS;
			/* Re-link the remainder so dropit can reach it. */
			if (mprev != NULL)
				mprev->m_nextpkt = nextpkt;
			goto dropit;
		}
		bcopy(eh, mtod(mcur, caddr_t), ETHER_HDR_LEN);

		/*
		 * The previous two M_PREPEND could have inserted one or two
		 * mbufs in front so we have to update the previous packet's
		 * m_nextpkt.
		 */
		mcur->m_nextpkt = nextpkt;
		if (mprev != NULL)
			mprev->m_nextpkt = mcur;
		else {
			/* The first mbuf in the original chain needs to be
			 * updated. */
			*mp = mcur;
		}
		mprev = mcur;
	}

	KMOD_IPSTAT_INC(ips_fragmented);
	return (error);

dropit:
	/*
	 * NOTE(review): *mp is only updated after the first packet has been
	 * fully processed.  If the m_pullup() at the top fails, or a prepend
	 * fails on the first packet (mprev == NULL), *mp still holds the
	 * original head pointer and the remaining packets in nextpkt are not
	 * linked from it.  Presumably m_pullup()/M_PREPEND free the mbuf on
	 * failure -- if so this loop touches freed memory and leaks the
	 * rest; confirm against mbuf(9) before relying on this path.
	 */
	for (mcur = *mp; mcur; mcur = m) { /* dropping the full packet chain */
		m = mcur->m_nextpkt;
		m_freem(mcur);
	}
	return (error);
}

/*
 * Link-state notification for member interface ifp: recompute the
 * bridge's own link state and pass the event to the member's STP port.
 * Does nothing if ifp is not found among the bridge's members.
 */
static void
bridge_linkstate(struct ifnet *ifp)
{
	struct bridge_softc *sc = ifp->if_bridge;
	struct bridge_iflist *bif;
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);

	bif = bridge_lookup_member_if(sc, ifp);
	if (bif == NULL) {
		NET_EPOCH_EXIT(et);
		return;
	}
	bridge_linkcheck(sc);

	bstp_linkstate(&bif->bif_stp);

	NET_EPOCH_EXIT(et);
}

/*
 * Derive the bridge interface's link state from its members and report
 * it through if_link_state_change().
 */
static void
bridge_linkcheck(struct bridge_softc *sc)
{
	struct bridge_iflist *bif;
	int new_link, hasls;

	BRIDGE_LOCK_OR_NET_EPOCH_ASSERT(sc);

	new_link = LINK_STATE_DOWN;
	hasls = 0;
	/* Our link is considered up if at least one of our ports is active */
	CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
		/* Count members that advertise link-state support. */
		if (bif->bif_ifp->if_capabilities & IFCAP_LINKSTATE)
			hasls++;
		if (bif->bif_ifp->if_link_state == LINK_STATE_UP) {
			new_link = LINK_STATE_UP;
			break;
		}
	}
	if (!CK_LIST_EMPTY(&sc->sc_iflist) && !hasls) {
		/* If no interfaces support link-state then we default to up */
		new_link = LINK_STATE_UP;
	}
	if_link_state_change(sc->sc_ifp, new_link);
}
diff --git a/sys/net/if_epair.c
b/sys/net/if_epair.c index 1dc5cb695d86..6bf7481065cd 100644 --- a/sys/net/if_epair.c +++ b/sys/net/if_epair.c @@ -1,889 +1,891 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 The FreeBSD Foundation * Copyright (c) 2009-2021 Bjoern A. Zeeb * * This software was developed by CK Software GmbH under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * A pair of virtual back-to-back connected ethernet like interfaces * (``two interfaces with a virtual cross-over cable''). * * This is mostly intended to be used to provide connectivity between * different virtual network stack instances. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_rss.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #ifdef INET #include #endif #ifdef INET6 #include #endif #endif #include -static int epair_clone_match(struct if_clone *, const char *); -static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t); -static int epair_clone_destroy(struct if_clone *, struct ifnet *); - static const char epairname[] = "epair"; #define RXRSIZE 4096 /* Probably overkill by 4-8x. */ static MALLOC_DEFINE(M_EPAIR, epairname, "Pair of virtual cross-over connected Ethernet-like interfaces"); VNET_DEFINE_STATIC(struct if_clone *, epair_cloner); #define V_epair_cloner VNET(epair_cloner) static unsigned int next_index = 0; #define EPAIR_LOCK_INIT() mtx_init(&epair_n_index_mtx, "epairidx", \ NULL, MTX_DEF) #define EPAIR_LOCK_DESTROY() mtx_destroy(&epair_n_index_mtx) #define EPAIR_LOCK() mtx_lock(&epair_n_index_mtx) #define EPAIR_UNLOCK() mtx_unlock(&epair_n_index_mtx) #define BIT_QUEUE_TASK 0 #define BIT_MBUF_QUEUED 1 struct epair_softc; struct epair_queue { int id; struct buf_ring *rxring[2]; volatile int ridx; /* 0 || 1 */ volatile long state; /* taskqueue coordination */ struct task tx_task; struct epair_softc *sc; }; static struct mtx epair_n_index_mtx; struct epair_softc { struct ifnet *ifp; /* This ifp. */ struct ifnet *oifp; /* other ifp of pair. */ int num_queues; struct epair_queue *queues; struct ifmedia media; /* Media config (fake). */ STAILQ_ENTRY(epair_softc) entry; }; struct epair_tasks_t { int tasks; struct taskqueue *tq[MAXCPU]; }; static struct epair_tasks_t epair_tasks; static void epair_clear_mbuf(struct mbuf *m) { /* Remove any CSUM_SND_TAG as ether_input will barf. 
*/ if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { m_snd_tag_rele(m->m_pkthdr.snd_tag); m->m_pkthdr.snd_tag = NULL; m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; } m_tag_delete_nonpersistent(m); } static void epair_if_input(struct epair_softc *sc, struct epair_queue *q, int ridx) { struct ifnet *ifp; struct mbuf *m; ifp = sc->ifp; CURVNET_SET(ifp->if_vnet); while (! buf_ring_empty(q->rxring[ridx])) { m = buf_ring_dequeue_mc(q->rxring[ridx]); if (m == NULL) continue; MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); (*ifp->if_input)(ifp, m); } CURVNET_RESTORE(); } static void epair_tx_start_deferred(void *arg, int pending) { struct epair_queue *q = (struct epair_queue *)arg; struct epair_softc *sc = q->sc; int ridx, nidx; if_ref(sc->ifp); ridx = atomic_load_int(&q->ridx); do { nidx = (ridx == 0) ? 1 : 0; } while (!atomic_fcmpset_int(&q->ridx, &ridx, nidx)); epair_if_input(sc, q, ridx); atomic_clear_long(&q->state, (1 << BIT_QUEUE_TASK)); if (atomic_testandclear_long(&q->state, BIT_MBUF_QUEUED)) taskqueue_enqueue(epair_tasks.tq[q->id], &q->tx_task); if_rele(sc->ifp); } static int epair_menq(struct mbuf *m, struct epair_softc *osc) { struct ifnet *ifp, *oifp; int len, ret; int ridx; short mflags; struct epair_queue *q = NULL; uint32_t bucket; #ifdef RSS struct ether_header *eh; #endif /* * I know this looks weird. We pass the "other sc" as we need that one * and can get both ifps from it as well. */ oifp = osc->ifp; ifp = osc->oifp; M_ASSERTPKTHDR(m); epair_clear_mbuf(m); if_setrcvif(m, oifp); M_SETFIB(m, oifp->if_fib); /* Save values as once the mbuf is queued, it's not ours anymore. */ len = m->m_pkthdr.len; mflags = m->m_flags; MPASS(m->m_nextpkt == NULL); MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); #ifdef RSS ret = rss_m2bucket(m, &bucket); if (ret) { /* Actually hash the packet. 
*/ eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { #ifdef INET case ETHERTYPE_IP: rss_soft_m2cpuid_v4(m, 0, &bucket); break; #endif #ifdef INET6 case ETHERTYPE_IPV6: rss_soft_m2cpuid_v6(m, 0, &bucket); break; #endif default: bucket = 0; break; } } bucket %= osc->num_queues; #else bucket = 0; #endif q = &osc->queues[bucket]; atomic_set_long(&q->state, (1 << BIT_MBUF_QUEUED)); ridx = atomic_load_int(&q->ridx); ret = buf_ring_enqueue(q->rxring[ridx], m); if (ret != 0) { /* Ring is full. */ if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* * IFQ_HANDOFF_ADJ/ip_handoff() update statistics, * but as we bypass all this we have to duplicate * the logic another time. */ if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); /* Someone else received the packet. */ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); if (!atomic_testandset_long(&q->state, BIT_QUEUE_TASK)) taskqueue_enqueue(epair_tasks.tq[bucket], &q->tx_task); return (0); } static void epair_start(struct ifnet *ifp) { struct mbuf *m; struct epair_softc *sc; struct ifnet *oifp; /* * We get packets here from ether_output via if_handoff() * and need to put them into the input queue of the oifp * and will put the packet into the receive-queue (rxq) of the * other interface (oifp) of our pair. */ sc = ifp->if_softc; oifp = sc->oifp; sc = oifp->if_softc; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; M_ASSERTPKTHDR(m); BPF_MTAP(ifp, m); /* In case either interface is not usable drop the packet. 
*/
		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
		    (ifp->if_flags & IFF_UP) == 0 ||
		    (oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
		    (oifp->if_flags & IFF_UP) == 0) {
			m_freem(m);
			continue;
		}

		/* sc is the peer's softc here; the return value is ignored. */
		(void) epair_menq(m, sc);
	}
}

/*
 * if_transmit handler: hand mbuf m sent on ifp to the other interface of
 * the pair.
 *
 * Returns 0 for a NULL mbuf and when the packet is dropped because the
 * peer is not running or not up, ENXIO if ifp is not running, ENETDOWN
 * if ifp is not up, the ALTQ enqueue error when ALTQ is enabled on
 * ifp's send queue, and otherwise the result of epair_menq().
 */
static int
epair_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct epair_softc *sc;
	struct ifnet *oifp;
	int error;
#ifdef ALTQ
	int len;
	short mflags;
#endif

	if (m == NULL)
		return (0);
	M_ASSERTPKTHDR(m);

	/*
	 * We are not going to use the interface en/dequeue mechanism
	 * on the TX side. We are called from ether_output_frame()
	 * and will put the packet into the receive-queue (rxq) of the
	 * other interface (oifp) of our pair.
	 */
	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
		m_freem(m);
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return (ENXIO);
	}
	if ((ifp->if_flags & IFF_UP) == 0) {
		m_freem(m);
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return (ENETDOWN);
	}

	BPF_MTAP(ifp, m);

	/*
	 * In case the outgoing interface is not usable,
	 * drop the packet.
	 */
	sc = ifp->if_softc;
	oifp = sc->oifp;
	if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    (oifp->if_flags & IFF_UP) == 0) {
		/* Counted as an output error on ifp, but reported as success. */
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		m_freem(m);
		return (0);
	}

#ifdef ALTQ
	/* Saved before enqueueing, for the counters updated afterwards. */
	len = m->m_pkthdr.len;
	mflags = m->m_flags;

	/* Support ALTQ via the classic if_start() path. */
	IF_LOCK(&ifp->if_snd);
	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
		ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error);
		if (error)
			if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
		IF_UNLOCK(&ifp->if_snd);
		if (!error) {
			if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
			if (mflags & (M_BCAST|M_MCAST))
				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
			/* Drain the send queue through epair_start(). */
			epair_start(ifp);
		}
		return (error);
	}
	IF_UNLOCK(&ifp->if_snd);
#endif

	/* Direct path: enqueue on the peer, passing the peer's softc. */
	error = epair_menq(m, oifp->if_softc);
	return (error);
}

/* if_qflush handler: intentionally empty. */
static void
epair_qflush(struct ifnet *ifp __unused)
{
}

static int
epair_media_change(struct ifnet *ifp __unused)
{

	/* Do nothing.
*/
	return (0);
}

/*
 * Media status callback: always reports a valid, active, full-duplex
 * 10G Ethernet link.
 */
static void
epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr)
{

	imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
	imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
}

/*
 * if_ioctl handler.  Flag and multicast requests are accepted without
 * action, media requests go to ifmedia_ioctl(), SIOCSIFMTU stores the
 * requested MTU unchecked, and everything else is passed to
 * ether_ioctl().  Returns the resulting error code.
 */
static int
epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct epair_softc *sc;
	struct ifreq *ifr;
	int error;

	ifr = (struct ifreq *)data;
	switch (cmd) {
	case SIOCSIFFLAGS:
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		/* Nothing to do for these; report success. */
		error = 0;
		break;

	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		sc = ifp->if_softc;
		error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd);
		break;

	case SIOCSIFMTU:
		/* We basically allow all kinds of MTUs. */
		ifp->if_mtu = ifr->ifr_mtu;
		error = 0;
		break;

	default:
		/* Let the common ethernet handler process this. */
		error = ether_ioctl(ifp, cmd, data);
		break;
	}

	return (error);
}

/* if_init handler: intentionally empty. */
static void
epair_init(void *dummy __unused)
{
}

/*
 * Interface cloning functions.
 * We use our private ones so that we can create/destroy our secondary
 * device along with the primary one.
 */
static int
epair_clone_match(struct if_clone *ifc, const char *name)
{
	const char *cp;

	/*
	 * Our base name is epair.
	 * Our interfaces will be named epair<n>[ab].
	 * So accept anything of the following list:
	 * - epair
	 * - epair<n>
	 * but not the epair<n>[ab] versions.
	 */
	if (strncmp(epairname, name, sizeof(epairname)-1) != 0)
		return (0);

	/* Everything after the "epair" prefix must be a decimal digit. */
	for (cp = name + sizeof(epairname) - 1; *cp != '\0'; cp++) {
		if (*cp < '0' || *cp > '9')
			return (0);
	}

	return (1);
}

static void
epair_clone_add(struct if_clone *ifc, struct epair_softc *scb)
{
	struct ifnet *ifp;
	uint8_t eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */

	ifp = scb->ifp;
	/* Copy epairNa etheraddr and change the last byte.
*/
	memcpy(eaddr, scb->oifp->if_hw_addr, ETHER_ADDR_LEN);
	eaddr[5] = 0x0b;
	ether_ifattach(ifp, eaddr);

	/* Register the second interface with the cloner as well. */
	if_clone_addif(ifc, ifp);
}

/*
 * Allocate one half of a pair: the ifnet, the softc, one queue per
 * epair task (each with two receive rings and a deferred-input task)
 * and the fake media list.
 *
 * Returns the new softc, or NULL if if_alloc() fails.  The ifc argument
 * is not used in this function.
 */
static struct epair_softc *
epair_alloc_sc(struct if_clone *ifc)
{
	struct epair_softc *sc;

	struct ifnet *ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL)
		return (NULL);

	sc = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
	sc->ifp = ifp;
	sc->num_queues = epair_tasks.tasks;
	/* Not zeroed by the allocator; the loop below sets every field. */
	sc->queues = mallocarray(sc->num_queues, sizeof(struct epair_queue),
	    M_EPAIR, M_WAITOK);
	for (int i = 0; i < sc->num_queues; i++) {
		struct epair_queue *q = &sc->queues[i];
		q->id = i;
		q->rxring[0] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
		q->rxring[1] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
		q->ridx = 0;
		q->state = 0;
		q->sc = sc;
		NET_TASK_INIT(&q->tx_task, 0, epair_tx_start_deferred, q);
	}

	/* Initialise pseudo media types. */
	ifmedia_init(&sc->media, 0, epair_media_change, epair_media_status);
	ifmedia_add(&sc->media, IFM_ETHER | IFM_10G_T, 0, NULL);
	ifmedia_set(&sc->media, IFM_ETHER | IFM_10G_T);

	return (sc);
}

/*
 * Fill in the ifnet of sc: interface name (copied from name, bounded by
 * IFNAMSIZ), driver name and unit, flags, capabilities, the epair
 * handler functions, send queue length and baudrate.
 */
static void
epair_setup_ifp(struct epair_softc *sc, char *name, int unit)
{
	struct ifnet *ifp = sc->ifp;

	ifp->if_softc = sc;
	strlcpy(ifp->if_xname, name, IFNAMSIZ);
	ifp->if_dname = epairname;
	ifp->if_dunit = unit;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_flags |= IFF_KNOWSEPOCH;
	ifp->if_capabilities = IFCAP_VLAN_MTU;
	ifp->if_capenable = IFCAP_VLAN_MTU;
	ifp->if_transmit = epair_transmit;
	ifp->if_qflush = epair_qflush;
	ifp->if_start = epair_start;
	ifp->if_ioctl = epair_ioctl;
	ifp->if_init  = epair_init;
	if_setsendqlen(ifp, ifqmaxlen);
	if_setsendqready(ifp);

	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
}

static void
epair_generate_mac(struct epair_softc *sc, uint8_t *eaddr)
{
	uint32_t key[3];
	uint32_t hash;
	uint64_t hostid;

	EPAIR_LOCK();
#ifdef SMP
	/* Get an approximate distribution.
*/ hash = next_index % mp_ncpus; #else hash = 0; #endif EPAIR_UNLOCK(); /* * Calculate the etheraddr hashing the hostid and the * interface index. The result would be hopefully unique. * Note that the "a" component of an epair instance may get moved * to a different VNET after creation. In that case its index * will be freed and the index can get reused by new epair instance. * Make sure we do not create same etheraddr again. */ getcredhostid(curthread->td_ucred, (unsigned long *)&hostid); if (hostid == 0) arc4rand(&hostid, sizeof(hostid), 0); struct ifnet *ifp = sc->ifp; EPAIR_LOCK(); if (ifp->if_index > next_index) next_index = ifp->if_index; else next_index++; key[0] = (uint32_t)next_index; EPAIR_UNLOCK(); key[1] = (uint32_t)(hostid & 0xffffffff); key[2] = (uint32_t)((hostid >> 32) & 0xfffffffff); hash = jenkins_hash32(key, 3, 0); eaddr[0] = 0x02; memcpy(&eaddr[1], &hash, 4); eaddr[5] = 0x0a; } static void epair_free_sc(struct epair_softc *sc) { if (sc == NULL) return; if_free(sc->ifp); ifmedia_removeall(&sc->media); for (int i = 0; i < sc->num_queues; i++) { struct epair_queue *q = &sc->queues[i]; buf_ring_free(q->rxring[0], M_EPAIR); buf_ring_free(q->rxring[1], M_EPAIR); } free(sc->queues, M_EPAIR); free(sc, M_EPAIR); } static int -epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +epair_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { struct epair_softc *sca, *scb; struct ifnet *ifp; char *dp; int error, unit, wildcard; uint8_t eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* Try to see if a special unit was requested. */ error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); error = ifc_alloc_unit(ifc, &unit); if (error != 0) return (error); /* * If no unit had been given, we need to adjust the ifName. * Also make sure there is space for our extra [ab] suffix. 
*/ for (dp = name; *dp != '\0'; dp++); if (wildcard) { error = snprintf(dp, len - (dp - name), "%d", unit); if (error > len - (dp - name) - 1) { /* ifName too long. */ ifc_free_unit(ifc, unit); return (ENOSPC); } dp += error; } if (len - (dp - name) - 1 < 1) { /* No space left for our [ab] suffix. */ ifc_free_unit(ifc, unit); return (ENOSPC); } *dp = 'b'; /* Must not change dp so we can replace 'a' by 'b' later. */ *(dp+1) = '\0'; /* Check if 'a' and 'b' interfaces already exist. */ if (ifunit(name) != NULL) return (EEXIST); *dp = 'a'; if (ifunit(name) != NULL) return (EEXIST); /* Allocate memory for both [ab] interfaces */ sca = epair_alloc_sc(ifc); scb = epair_alloc_sc(ifc); if (sca == NULL || scb == NULL) { epair_free_sc(sca); epair_free_sc(scb); ifc_free_unit(ifc, unit); return (ENOSPC); } /* * Cross-reference the interfaces so we will be able to free both. */ sca->oifp = scb->ifp; scb->oifp = sca->ifp; /* Finish initialization of interface a. */ ifp = sca->ifp; epair_setup_ifp(sca, name, unit); epair_generate_mac(sca, eaddr); ether_ifattach(ifp, eaddr); /* Swap the name and finish initialization of interface b. */ *dp = 'b'; epair_setup_ifp(scb, name, unit); ifp = scb->ifp; /* We need to play some tricks here for the second interface. */ strlcpy(name, epairname, len); /* Correctly set the name for the cloner list. */ strlcpy(name, scb->ifp->if_xname, len); epair_clone_add(ifc, scb); /* * Restore name to a as the ifp for this will go into the * cloner list for the initial call. */ strlcpy(name, sca->ifp->if_xname, len); /* Tell the world, that we are ready to rock. 
*/ sca->ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(sca->ifp, LINK_STATE_UP); scb->ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(scb->ifp, LINK_STATE_UP); + *ifpp = sca->ifp; + return (0); } static void epair_drain_rings(struct epair_softc *sc) { int ridx; struct mbuf *m; for (ridx = 0; ridx < 2; ridx++) { for (int i = 0; i < sc->num_queues; i++) { struct epair_queue *q = &sc->queues[i]; do { m = buf_ring_dequeue_sc(q->rxring[ridx]); if (m == NULL) break; m_freem(m); } while (1); } } } static int -epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct ifnet *oifp; struct epair_softc *sca, *scb; int unit, error; /* * In case we called into if_clone_destroyif() ourselves * again to remove the second interface, the softc will be * NULL. In that case so not do anything but return success. */ if (ifp->if_softc == NULL) return (0); unit = ifp->if_dunit; sca = ifp->if_softc; oifp = sca->oifp; scb = oifp->if_softc; /* Frist get the interfaces down and detached. */ if_link_state_change(ifp, LINK_STATE_DOWN); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(oifp, LINK_STATE_DOWN); oifp->if_drv_flags &= ~IFF_DRV_RUNNING; ether_ifdetach(ifp); ether_ifdetach(oifp); /* Third free any queued packets and all the resources. */ CURVNET_SET_QUIET(oifp->if_vnet); epair_drain_rings(scb); oifp->if_softc = NULL; error = if_clone_destroyif(ifc, oifp); if (error) panic("%s: if_clone_destroyif() for our 2nd iface failed: %d", __func__, error); epair_free_sc(scb); CURVNET_RESTORE(); epair_drain_rings(sca); epair_free_sc(sca); /* Last free the cloner unit. 
*/ ifc_free_unit(ifc, unit); return (0); } static void vnet_epair_init(const void *unused __unused) { - - V_epair_cloner = if_clone_advanced(epairname, 0, - epair_clone_match, epair_clone_create, epair_clone_destroy); + struct if_clone_addreq req = { + .match_f = epair_clone_match, + .create_f = epair_clone_create, + .destroy_f = epair_clone_destroy, + }; + V_epair_cloner = ifc_attach_cloner(epairname, &req); } VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_epair_init, NULL); static void vnet_epair_uninit(const void *unused __unused) { - if_clone_detach(V_epair_cloner); + ifc_detach_cloner(V_epair_cloner); } VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_epair_uninit, NULL); static int epair_mod_init(void) { char name[32]; epair_tasks.tasks = 0; #ifdef RSS int cpu; CPU_FOREACH(cpu) { cpuset_t cpu_mask; /* Pin to this CPU so we get appropriate NUMA allocations. */ thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); snprintf(name, sizeof(name), "epair_task_%d", cpu); epair_tasks.tq[cpu] = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &epair_tasks.tq[cpu]); CPU_SETOF(cpu, &cpu_mask); taskqueue_start_threads_cpuset(&epair_tasks.tq[cpu], 1, PI_NET, &cpu_mask, "%s", name); epair_tasks.tasks++; } thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); #else snprintf(name, sizeof(name), "epair_task"); epair_tasks.tq[0] = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &epair_tasks.tq[0]); taskqueue_start_threads(&epair_tasks.tq[0], 1, PI_NET, "%s", name); epair_tasks.tasks = 1; #endif return (0); } static void epair_mod_cleanup(void) { for (int i = 0; i < epair_tasks.tasks; i++) { taskqueue_drain_all(epair_tasks.tq[i]); taskqueue_free(epair_tasks.tq[i]); } } static int epair_modevent(module_t mod, int type, void *data) { int ret; switch (type) { case MOD_LOAD: EPAIR_LOCK_INIT(); ret = epair_mod_init(); if (ret != 0) return (ret); if (bootverbose) printf("%s: %s 
initialized.\n", __func__, epairname); break; case MOD_UNLOAD: epair_mod_cleanup(); EPAIR_LOCK_DESTROY(); if (bootverbose) printf("%s: %s unloaded.\n", __func__, epairname); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t epair_mod = { "if_epair", epair_modevent, 0 }; DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); MODULE_VERSION(if_epair, 3); diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index 8e273c4ed391..7c5e0127d7ff 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -1,2734 +1,2743 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * Copyright (c) 2014, 2016 Marcelo Araujo * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif #define LAGG_SX_INIT(_sc) sx_init(&(_sc)->sc_sx, "if_lagg sx") #define LAGG_SX_DESTROY(_sc) sx_destroy(&(_sc)->sc_sx) #define LAGG_XLOCK(_sc) sx_xlock(&(_sc)->sc_sx) #define LAGG_XUNLOCK(_sc) sx_xunlock(&(_sc)->sc_sx) #define LAGG_SXLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_LOCKED) #define LAGG_XLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_XLOCKED) /* Special flags we should propagate to the lagg ports. 
*/ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; struct lagg_snd_tag { struct m_snd_tag com; struct m_snd_tag *tag; }; VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx); #define V_lagg_list_mtx VNET(lagg_list_mtx) #define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ "if_lagg list", NULL, MTX_DEF) #define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) #define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; -static int lagg_clone_create(struct if_clone *, int, caddr_t); -static void lagg_clone_destroy(struct ifnet *); +static int lagg_clone_create(struct if_clone *, char *, size_t, + struct ifc_data *, struct ifnet **); +static int lagg_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface"); static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input_ethernet(struct ifnet *, struct mbuf *); static struct mbuf *lagg_input_infiniband(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void 
lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); #if defined(KERN_TLS) || defined(RATELIMIT) static int lagg_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); static int lagg_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int lagg_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void lagg_snd_tag_free(struct m_snd_tag *); static struct m_snd_tag *lagg_next_snd_tag(struct m_snd_tag *); static void lagg_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); static void lagg_setcaps(struct lagg_port *, int cap, int cap2); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit_ethernet(struct ifnet *, struct mbuf *); static int lagg_transmit_infiniband(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); /* Simple round robin */ static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *); static void lagg_lb_detach(struct lagg_softc *); static int 
lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* Broadcast */ static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *); static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ static const struct lagg_proto { lagg_proto pr_num; void (*pr_attach)(struct lagg_softc *); void (*pr_detach)(struct lagg_softc *); int (*pr_start)(struct lagg_softc *, struct mbuf *); struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, struct mbuf *); int (*pr_addport)(struct lagg_port *); void (*pr_delport)(struct lagg_port *); void (*pr_linkstate)(struct lagg_port *); void (*pr_init)(struct lagg_softc *); void (*pr_stop)(struct lagg_softc *); void (*pr_lladdr)(struct lagg_softc *); void (*pr_request)(struct lagg_softc *, void *); void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { { .pr_num = LAGG_PROTO_NONE }, { .pr_num = LAGG_PROTO_ROUNDROBIN, .pr_attach = lagg_rr_attach, .pr_start = lagg_rr_start, .pr_input = lagg_rr_input, }, { .pr_num = LAGG_PROTO_FAILOVER, .pr_start = lagg_fail_start, .pr_input = lagg_fail_input, }, { .pr_num = LAGG_PROTO_LOADBALANCE, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, .pr_addport = lagg_lb_port_create, .pr_delport = lagg_lb_port_destroy, }, { .pr_num = LAGG_PROTO_LACP, .pr_attach = lagg_lacp_attach, 
.pr_detach = lagg_lacp_detach, .pr_start = lagg_lacp_start, .pr_input = lagg_lacp_input, .pr_addport = lacp_port_create, .pr_delport = lacp_port_destroy, .pr_linkstate = lacp_linkstate, .pr_init = lacp_init, .pr_stop = lacp_stop, .pr_lladdr = lagg_lacp_lladdr, .pr_request = lacp_req, .pr_portreq = lacp_portreq, }, { .pr_num = LAGG_PROTO_BROADCAST, .pr_start = lagg_bcast_start, .pr_input = lagg_bcast_input, }, }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Link Aggregation"); /* Allow input on any failover links */ VNET_DEFINE_STATIC(int, lagg_failover_rx_all); #define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); /* Default value for using flowid */ VNET_DEFINE_STATIC(int, def_use_flowid) = 0; #define V_def_use_flowid VNET(def_use_flowid) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); /* Default value for using numa */ VNET_DEFINE_STATIC(int, def_use_numa) = 1; #define V_def_use_numa VNET(def_use_numa) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN, &VNET_NAME(def_use_numa), 0, "Use numa to steer flows"); /* Default value for flowid shift */ VNET_DEFINE_STATIC(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, &VNET_NAME(def_flowid_shift), 0, "Default setting for flowid shift for load sharing"); static void vnet_lagg_init(const void *unused __unused) { LAGG_LIST_LOCK_INIT(); SLIST_INIT(&V_lagg_list); - V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, - lagg_clone_destroy, 0); + struct if_clone_addreq req = { + .create_f = lagg_clone_create, + .destroy_f = lagg_clone_destroy, + .flags = 
IFC_F_AUTOUNIT, + }; + V_lagg_cloner = ifc_attach_cloner(laggname, &req); } VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_init, NULL); /* Per-vnet teardown: detach the cloner, then destroy the lagg list lock. */ static void vnet_lagg_uninit(const void *unused __unused) { - if_clone_detach(V_lagg_cloner); + ifc_detach_cloner(V_lagg_cloner); LAGG_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_lagg_uninit, NULL); /* * Module event handler. MOD_LOAD publishes the lagg input and link-state * routines through the lagg_*_p pointers and registers lagg_port_ifdetach() * for ifnet_departure_event; MOD_UNLOAD deregisters that handler and clears * the pointers again. Any other event type returns EOPNOTSUPP. */ static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: lagg_input_ethernet_p = lagg_input_ethernet; lagg_input_infiniband_p = lagg_input_infiniband; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); lagg_input_ethernet_p = NULL; lagg_input_infiniband_p = NULL; lagg_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t lagg_mod = { "if_lagg", lagg_modevent, 0 }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); MODULE_DEPEND(if_lagg, if_infiniband, 1, 1, 1); /* * Attach protocol pr to sc: run its optional pr_attach hook, then record pr in * sc->sc_proto. LAGG_XLOCK_ASSERT is checked on entry and the KASSERT requires * that no protocol is currently attached. */ static void lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) { LAGG_XLOCK_ASSERT(sc); KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto", __func__, sc)); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "using proto %u\n", pr); if (lagg_protos[pr].pr_attach != NULL) lagg_protos[pr].pr_attach(sc); sc->sc_proto = pr; } /* * Detach the current protocol. sc_proto is reset to LAGG_PROTO_NONE before * the optional pr_detach hook of the old protocol runs. */ static void lagg_proto_detach(struct lagg_softc *sc) { lagg_proto pr; LAGG_XLOCK_ASSERT(sc); pr = sc->sc_proto; sc->sc_proto = LAGG_PROTO_NONE; if (lagg_protos[pr].pr_detach != NULL) lagg_protos[pr].pr_detach(sc); } /* Pass m to the current protocol's pr_start hook (not NULL-checked). */ static int lagg_proto_start(struct lagg_softc *sc, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_start(sc, m)); } /* Pass m, received on port lp, to the current protocol's pr_input hook (not NULL-checked). */ static struct mbuf * lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { return 
(lagg_protos[sc->sc_proto].pr_input(sc, lp, m)); } /* * The wrappers that follow dispatch to the optional hooks of the current * protocol (sc->sc_proto) and do nothing when the hook is NULL. */ /* Run pr_addport for lp; returns the hook's result, or 0 when there is no hook. */ static int lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_addport == NULL) return (0); else return (lagg_protos[sc->sc_proto].pr_addport(lp)); } /* Run pr_delport for lp, if the protocol has one. */ static void lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_delport != NULL) lagg_protos[sc->sc_proto].pr_delport(lp); } /* Run pr_linkstate for lp, if the protocol has one. */ static void lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_linkstate != NULL) lagg_protos[sc->sc_proto].pr_linkstate(lp); } /* Run pr_init, if the protocol has one. */ static void lagg_proto_init(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_init != NULL) lagg_protos[sc->sc_proto].pr_init(sc); } /* Run pr_stop, if the protocol has one. */ static void lagg_proto_stop(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_stop != NULL) lagg_protos[sc->sc_proto].pr_stop(sc); } /* Run pr_lladdr, if the protocol has one. */ static void lagg_proto_lladdr(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_lladdr != NULL) lagg_protos[sc->sc_proto].pr_lladdr(sc); } /* Run pr_request with the opaque buffer v, if the protocol has one. */ static void lagg_proto_request(struct lagg_softc *sc, void *v) { if (lagg_protos[sc->sc_proto].pr_request != NULL) lagg_protos[sc->sc_proto].pr_request(sc, v); } /* Run pr_portreq for lp with the opaque buffer v, if the protocol has one. */ static void lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v) { if (lagg_protos[sc->sc_proto].pr_portreq != NULL) lagg_protos[sc->sc_proto].pr_portreq(lp, v); } /* * This routine is run via a vlan * config EVENT: when the event is for the * lagg whose softc equals arg, vlan_config is re-invoked on each of its * member ports with the same vtag. */ static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); LAGG_XUNLOCK(sc); } /* * This routine is run via a vlan * unconfig EVENT: when the event is for the * lagg whose softc equals arg, vlan_unconfig is re-invoked on each of its * member ports with the same vtag. */ static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event 
*/ return; LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); LAGG_XUNLOCK(sc); } /* * Cloner create callback: allocate and set up a new lagg interface. * The optional creation parameters (struct iflaggparam) select an Ethernet or * InfiniBand lagg; Ethernet is used when none are supplied. * Returns 0 on success, the copy-in error, EINVAL for an unknown lagg_type, * or ENOSPC when if_alloc() fails. */ static int -lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) +lagg_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { struct iflaggparam iflp; struct lagg_softc *sc; struct ifnet *ifp; int if_type; int error; /* all-zero link-level address handed to the *_ifattach() call */ static const uint8_t eaddr[LAGG_ADDR_LEN]; - if (params != NULL) { - error = copyin(params, &iflp, sizeof(iflp)); + if (ifd->params != NULL) { + error = ifc_copyin(ifd, &iflp, sizeof(iflp)); if (error) return (error); switch (iflp.lagg_type) { case LAGG_TYPE_ETHERNET: if_type = IFT_ETHER; break; case LAGG_TYPE_INFINIBAND: if_type = IFT_INFINIBAND; break; default: return (EINVAL); } } else { /* No creation parameters: default to Ethernet */ if_type = IFT_ETHER; } sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(if_type); if (ifp == NULL) { free(sc, M_LAGG); return (ENOSPC); } LAGG_SX_INIT(sc); mtx_init(&sc->sc_mtx, "lagg-mtx", NULL, MTX_DEF); /* The watchdog callout is tied to sc_mtx */ callout_init_mtx(&sc->sc_watchdog, &sc->sc_mtx, 0); LAGG_XLOCK(sc); /* Seed the per-lagg options from the sysctl defaults */ if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; if (V_def_use_numa) sc->sc_opts |= LAGG_OPT_USE_NUMA; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4; lagg_proto_attach(sc, LAGG_PROTO_DEFAULT); CK_SLIST_INIT(&sc->sc_ports); switch (if_type) { case IFT_ETHER: /* Initialise pseudo media types */ ifmedia_init(&sc->sc_media, 0, lagg_media_change, lagg_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); - if_initname(ifp, laggname, unit); + if_initname(ifp, laggname, ifd->unit); ifp->if_transmit = lagg_transmit_ethernet; break; case IFT_INFINIBAND: - if_initname(ifp, laggname, unit); + if_initname(ifp, laggname, ifd->unit); ifp->if_transmit = lagg_transmit_infiniband; break; default: break; } 
ifp->if_softc = sc; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = lagg_snd_tag_alloc; ifp->if_ratelimit_query = lagg_ratelimit_query; #endif ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device, children will be attached * as special device IFT_IEEE8023ADLAG or IFT_INFINIBANDLAG. */ switch (if_type) { case IFT_ETHER: ether_ifattach(ifp, eaddr); break; case IFT_INFINIBAND: infiniband_ifattach(ifp, eaddr, sc->sc_bcast_addr); break; default: break; } /* sc is the handler argument that lagg_register_vlan()/lagg_unregister_vlan() compare against ifp->if_softc */ sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ LAGG_LIST_LOCK(); SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); LAGG_LIST_UNLOCK(); LAGG_XUNLOCK(sc); + *ifpp = ifp; return (0); } /* * Cloner destroy callback: stop the lagg, deregister the vlan event handlers, * remove every port, detach the protocol and the ifnet, unlink the softc from * the global lagg list and free it. */ -static void -lagg_clone_destroy(struct ifnet *ifp) +static int +lagg_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; LAGG_XLOCK(sc); /* From here on lagg_port_create() refuses new ports (ENXIO) and lagg_port_destroy() skips the lladdr updates */ sc->sc_destroying = 1; lagg_stop(sc); ifp->if_flags &= ~IFF_UP; EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); /* Shutdown and remove lagg ports */ while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ lagg_proto_detach(sc); LAGG_XUNLOCK(sc); switch (ifp->if_type) { case IFT_ETHER: ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); break; case IFT_INFINIBAND: infiniband_ifdetach(ifp); break; default: break; } if_free(ifp); LAGG_LIST_LOCK(); SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); LAGG_LIST_UNLOCK(); 
mtx_destroy(&sc->sc_mtx); LAGG_SX_DESTROY(sc); free(sc, M_LAGG); + + return (0); } static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; int cap, cap2, ena, ena2, pena, pena2; uint64_t hwa; struct ifnet_hw_tsomax hw_tsomax; LAGG_XLOCK_ASSERT(sc); /* Get common enabled capabilities for the lagg ports */ ena = ena2 = ~0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { ena &= lp->lp_ifp->if_capenable; ena2 &= lp->lp_ifp->if_capenable2; } if (CK_SLIST_FIRST(&sc->sc_ports) == NULL) ena = ena2 = 0; /* * Apply common enabled capabilities back to the lagg ports. * May require several iterations if they are dependent. */ do { pena = ena; pena2 = ena2; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setcaps(lp, ena, ena2); ena &= lp->lp_ifp->if_capenable; ena2 &= lp->lp_ifp->if_capenable2; } } while (pena != ena || pena2 != ena2); /* Get other capabilities from the lagg ports */ cap = cap2 = ~0; hwa = ~(uint64_t)0; memset(&hw_tsomax, 0, sizeof(hw_tsomax)); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; cap2 &= lp->lp_ifp->if_capabilities2; hwa &= lp->lp_ifp->if_hwassist; if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax); } if (CK_SLIST_FIRST(&sc->sc_ports) == NULL) cap = cap2 = hwa = 0; if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_capenable2 != ena2 || sc->sc_ifp->if_hwassist != hwa || if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capabilities2 = cap2; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_capenable2 = ena2; sc->sc_ifp->if_hwassist = hwa; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "capabilities 0x%08x enabled 0x%08x\n", cap, ena); } } static int lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp, *tlp; struct ifreq ifr; int error, i, oldmtu; int if_type; uint64_t *pval; LAGG_XLOCK_ASSERT(sc); 
if (sc->sc_ifp == ifp) { if_printf(sc->sc_ifp, "cannot add a lagg to itself as a port\n"); return (EINVAL); } if (sc->sc_destroying == 1) return (ENXIO); /* Limit the maximal number of lagg ports */ if (sc->sc_count >= LAGG_MAX_PORTS) return (ENOSPC); /* Check if port has already been associated to a lagg */ if (ifp->if_lagg != NULL) { /* Port is already in the current lagg? */ lp = (struct lagg_port *)ifp->if_lagg; if (lp->lp_softc == sc) return (EEXIST); return (EBUSY); } /* Pick the interface type the port will carry while it is a member */ switch (sc->sc_ifp->if_type) { case IFT_ETHER: /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return (EPROTONOSUPPORT); if_type = IFT_IEEE8023ADLAG; break; case IFT_INFINIBAND: /* XXX Disallow non-infiniband interfaces */ if (ifp->if_type != IFT_INFINIBAND) return (EPROTONOSUPPORT); if_type = IFT_INFINIBANDLAG; break; default: /* NOTE(review): if_type stays uninitialised on this path although it is stored into ifp->if_type later in this function; lagg_clone_create() only creates IFT_ETHER or IFT_INFINIBAND laggs, so this is presumably unreachable -- confirm. */ break; } /* Allow the first Ethernet member to define the MTU */ oldmtu = -1; if (CK_SLIST_EMPTY(&sc->sc_ports)) { sc->sc_ifp->if_mtu = ifp->if_mtu; } else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { /* Later ports are forced to the lagg's MTU */ if (ifp->if_ioctl == NULL) { if_printf(sc->sc_ifp, "cannot change MTU for %s\n", ifp->if_xname); return (EINVAL); } oldmtu = ifp->if_mtu; strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_mtu = sc->sc_ifp->if_mtu; error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); if (error != 0) { if_printf(sc->sc_ifp, "invalid MTU for %s\n", ifp->if_xname); return (error); } /* Pre-load ifr with the port's original MTU so that the error paths below can restore it with one SIOCSIFMTU call (they do so only when oldmtu != -1) */ ifr.ifr_mtu = oldmtu; } lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO); lp->lp_softc = sc; /* Check if port is a stacked lagg */ LAGG_LIST_LOCK(); SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (EINVAL); /* XXX disable stacking for the moment, its untested */ #ifdef LAGG_PORT_STACKING lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= 
LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (E2BIG); } #endif } } LAGG_LIST_UNLOCK(); /* This reference is dropped by lagg_port_destroy_cb() */ if_ref(ifp); lp->lp_ifp = ifp; /* Remember the port's own lladdr and enabled capabilities so lagg_port_destroy() can restore them */ bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ifp->if_addrlen); /* NOTE(review): only if_capenable is saved here, while lagg_port_destroy() also restores lp_ifcapenable2, which appears to stay zero from the M_ZERO allocation -- confirm. */ lp->lp_ifcapenable = ifp->if_capenable; if (CK_SLIST_EMPTY(&sc->sc_ports)) { /* First port: the lagg adopts this port's lladdr */ bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ifp->if_addrlen); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } else { /* Later ports take over the lagg's lladdr */ if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ifp->if_addrlen); } lagg_setflags(lp, 1); if (CK_SLIST_EMPTY(&sc->sc_ports)) sc->sc_primary = lp; /* Change the interface type */ lp->lp_iftype = ifp->if_type; ifp->if_type = if_type; ifp->if_lagg = lp; /* Interpose on the port's ioctl and output routines, saving the originals */ lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; lp->lp_output = ifp->if_output; ifp->if_output = lagg_port_output; /* Read port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) *pval = ifp->if_get_counter(ifp, i); /* * Insert into the list of ports. * Keep ports sorted by if_index. It is handy, when configuration * is predictable and `ifconfig laggN create ...` command * will lead to the same result each time. */ CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( CK_SLIST_NEXT(tlp, lp_entries) == NULL || ((struct lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index > ifp->if_index)) break; } /* tlp is the entry to insert after, or NULL when the new port belongs at the head */ if (tlp != NULL) CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries); else CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); sc->sc_count++; lagg_setmulti(lp); if ((error = lagg_proto_addport(sc, lp)) != 0) { /* Remove the port, without calling pr_delport. 
*/ lagg_port_destroy(lp, 0); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (error); } /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } #ifdef LAGG_PORT_STACKING /* * Return the stacking depth below sc: one more than the deepest lagg found * among its ports flagged LAGG_PORT_STACK (1 when there are none). */ static int lagg_port_checkstacking(struct lagg_softc *sc) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int m = 0; LAGG_SXLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_flags & LAGG_PORT_STACK) { sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; m = MAX(m, lagg_port_checkstacking(sc_ptr)); } } return (m + 1); } #endif /* * Deferred half of lagg_port_destroy(), scheduled through NET_EPOCH_CALL: * drop the ifnet reference held by the port and free the port structure. */ static void lagg_port_destroy_cb(epoch_context_t ec) { struct lagg_port *lp; struct ifnet *ifp; lp = __containerof(ec, struct lagg_port, lp_epoch_ctx); ifp = lp->lp_ifp; if_rele(ifp); free(lp, M_LAGG); } /* * Remove port lp from its lagg and restore the underlying interface. * rundelport selects whether the protocol's pr_delport hook is called first. * Always returns 0. LAGG_XLOCK_ASSERT is checked on entry. */ static int lagg_port_destroy(struct lagg_port *lp, int rundelport) { struct lagg_softc *sc = lp->lp_softc; struct lagg_port *lp_ptr, *lp0; struct ifnet *ifp = lp->lp_ifp; uint64_t *pval, vdiff; int i; LAGG_XLOCK_ASSERT(sc); if (rundelport) lagg_proto_delport(sc, lp); if (lp->lp_detaching == 0) lagg_clrmulti(lp); /* Restore interface */ ifp->if_type = lp->lp_iftype; ifp->if_ioctl = lp->lp_ioctl; ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; /* Update detached port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) { /* growth since the port was attached */ vdiff = ifp->if_get_counter(ifp, i) - *pval; sc->detached_counters.val[i] += vdiff; } /* Finally, remove the port from the lagg */ CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; /* Update the primary interface */ if (lp == sc->sc_primary) { uint8_t lladdr[LAGG_ADDR_LEN]; /* New lagg lladdr: the saved address of the first remaining port, or all zeroes when none is left */ if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL) bzero(&lladdr, LAGG_ADDR_LEN); else bcopy(lp0->lp_lladdr, lladdr, LAGG_ADDR_LEN); sc->sc_primary = lp0; if (sc->sc_destroying == 0) { bcopy(lladdr, IF_LLADDR(sc->sc_ifp), sc->sc_ifp->if_addrlen); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); /* * Update lladdr for each 
port (new primary needs update * as well, to switch from old lladdr to its 'real' one). * We can skip this if the lagg is being destroyed. */ CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) if_setlladdr(lp_ptr->lp_ifp, lladdr, lp_ptr->lp_ifp->if_addrlen); } } if (lp->lp_ifflags) if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); /* Flags, capabilities and lladdr are only restored when the ifnet itself is not going away */ if (lp->lp_detaching == 0) { lagg_setflags(lp, 0); lagg_setcaps(lp, lp->lp_ifcapenable, lp->lp_ifcapenable2); if_setlladdr(ifp, lp->lp_lladdr, ifp->if_addrlen); } /* * Free the port and release its ifnet reference after a grace period has * elapsed. */ NET_EPOCH_CALL(lagg_port_destroy_cb, &lp->lp_epoch_ctx); /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } /* * if_ioctl replacement installed on member ports by lagg_port_create(). * SIOCGLAGGPORT, SIOCSIFCAP/SIOCSIFCAPNV and SIOCSIFMTU are handled here; * every other command, and any ifp that is not currently a lagg port, is * passed to the port's saved lp_ioctl, or fails with EINVAL when there is * none. */ static int lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct epoch_tracker et; struct lagg_reqport *rp = (struct lagg_reqport *)data; struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; /* Should be checked by the caller */ switch (ifp->if_type) { case IFT_IEEE8023ADLAG: case IFT_INFINIBANDLAG: if ((lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) goto fallback; break; default: goto fallback; } switch (cmd) { case SIOCGLAGGPORT: /* The request must name this very port */ if (rp->rp_portname[0] == '\0' || ifunit(rp->rp_portname) != ifp) { error = EINVAL; break; } NET_EPOCH_ENTER(et); /* Re-read if_lagg inside the epoch section and make sure it still belongs to sc */ if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; NET_EPOCH_EXIT(et); break; } lagg_port2req(lp, rp); NET_EPOCH_EXIT(et); break; case SIOCSIFCAP: case SIOCSIFCAPNV: if (lp->lp_ioctl == NULL) { error = EINVAL; break; } /* Let the port driver apply the change first */ error = (*lp->lp_ioctl)(ifp, cmd, data); if (error) break; /* Update lagg interface capabilities */ LAGG_XLOCK(sc); lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); break; case SIOCSIFMTU: /* Do not allow the MTU to be changed once joined */ error = EINVAL; break; default: goto fallback; } return (error); fallback: if (lp != NULL && lp->lp_ioctl != NULL) return ((*lp->lp_ioctl)(ifp, cmd, data)); return 
(EINVAL); } /* * Requests counter @cnt data. * * Counter value is calculated the following way: * 1) for each port, sum difference between current and "initial" measurements. * 2) add lagg logical interface counters. * 3) add data from detached_counters array. * * We also do the following things on ports attach/detach: * 1) On port attach we store all counters it has into the port_counters array. * 2) On port detach we add the difference between "initial" and * current counters data to detached_counters array. */ static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt) { struct epoch_tracker et; struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lpifp; uint64_t newval, oldval, vsum; /* Revise this when we've got non-generic counters. */ KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); sc = (struct lagg_softc *)ifp->if_softc; vsum = 0; /* The port list is walked inside a net epoch section; no lagg lock is taken here */ NET_EPOCH_ENTER(et); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* Value saved when the port was attached */ oldval = lp->port_counters.val[cnt]; /* current value */ lpifp = lp->lp_ifp; newval = lpifp->if_get_counter(lpifp, cnt); /* Accumulate the growth since attach; the saved baseline is left untouched */ vsum += newval - oldval; } NET_EPOCH_EXIT(et); /* * Add counter data which might be added by upper * layer protocols operating on logical interface. */ vsum += if_get_counter_default(ifp, cnt); /* * Add counter data from detached ports counters */ vsum += sc->detached_counters.val[cnt]; return (vsum); } /* * For direct output to child ports. 
*/ /* * if_output replacement installed on member ports: only frames addressed with * pseudo_AF_HDRCMPLT or AF_UNSPEC are handed to the port's saved lp_output; * anything else is freed and ENETDOWN is returned. */ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: if (lp != NULL) return ((*lp->lp_output)(ifp, m, dst, ro)); } /* drop any other frames */ m_freem(m); return (ENETDOWN); } /* * ifnet_departure_event handler (registered in lagg_modevent()): when the * departing ifnet is a lagg port and is not merely being renamed, remove it * from its lagg with lp_detaching set. */ static void lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) { struct lagg_port *lp; struct lagg_softc *sc; if ((lp = ifp->if_lagg) == NULL) return; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; sc = lp->lp_softc; LAGG_XLOCK(sc); /* Tells lagg_port_destroy() not to touch multicast, flags, capabilities or lladdr of the vanishing ifnet */ lp->lp_detaching = 1; lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); } /* * Fill rp with the lagg name, port name, priority and flags of lp, let the * protocol add its own data through pr_portreq, then OR in the * protocol-specific status flags. */ static void lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) { struct lagg_softc *sc = lp->lp_softc; strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; lagg_proto_portreq(sc, lp, &rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: if (lp == sc->sc_primary) rp->rp_flags |= LAGG_PORT_MASTER; if (lp == lagg_link_active(sc, sc->sc_primary)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_LACP: /* LACP has a different definition of active */ if (lacp_isactive(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; if (lacp_iscollecting(lp)) rp->rp_flags |= LAGG_PORT_COLLECTING; if (lacp_isdistributing(lp)) rp->rp_flags |= LAGG_PORT_DISTRIBUTING; break; } } /* * Periodic callout for InfiniBand laggs: copy the active port's link-level * and broadcast addresses to the lagg when they differ, then re-arm itself. * arg is the lagg softc. */ static void lagg_watchdog_infiniband(void *arg) { struct epoch_tracker et; struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *ifp; struct ifnet *lp_ifp; sc = arg; /* * Because infiniband nodes have a fixed MAC address, which is * generated by the 
so-called GID, we need to regularly update * the link level address of the parent lagg device when * the active port changes. Possibly we could piggy-back on * link up/down events as well, but using a timer also provides * a guarantee against too frequent events. This operation * does not have to be atomic. */ NET_EPOCH_ENTER(et); lp = lagg_link_active(sc, sc->sc_primary); if (lp != NULL) { ifp = sc->sc_ifp; lp_ifp = lp->lp_ifp; /* Only touch the lagg and raise iflladdr_event when either address actually differs */ if (ifp != NULL && lp_ifp != NULL && (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen) != 0 || memcmp(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen) != 0)) { memcpy(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen); memcpy(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen); CURVNET_SET(ifp->if_vnet); EVENTHANDLER_INVOKE(iflladdr_event, ifp); CURVNET_RESTORE(); } } NET_EPOCH_EXIT(et); /* Re-arm: run this function again after hz ticks */ callout_reset(&sc->sc_watchdog, hz, &lagg_watchdog_infiniband, arg); } /* * if_init handler: mark the lagg running, push the lagg's lladdr to every * port whose lladdr differs, run the protocol's init hook and, for an * InfiniBand lagg, start the address watchdog. Does nothing when the * interface is already running. xsc is the lagg softc. */ static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; struct lagg_port *lp; LAGG_XLOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { LAGG_XUNLOCK(sc); return; } ifp->if_drv_flags |= IFF_DRV_RUNNING; /* * Update the port lladdrs if needed. * This might be if_setlladdr() notification * that lladdr has been changed. 
*/ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp), ifp->if_addrlen) != 0) if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ifp->if_addrlen); } lagg_proto_init(sc); if (ifp->if_type == IFT_INFINIBAND) { /* First watchdog run is made directly, under sc_mtx; it re-arms itself from then on */ mtx_lock(&sc->sc_mtx); lagg_watchdog_infiniband(sc); mtx_unlock(&sc->sc_mtx); } LAGG_XUNLOCK(sc); } /* * Stop the lagg: clear IFF_DRV_RUNNING, run the protocol's stop hook, then * stop and drain the watchdog callout. Does nothing when the interface is * not running. LAGG_XLOCK_ASSERT is checked on entry. */ static void lagg_stop(struct lagg_softc *sc) { struct ifnet *ifp = sc->sc_ifp; LAGG_XLOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; lagg_proto_stop(sc); mtx_lock(&sc->sc_mtx); callout_stop(&sc->sc_watchdog); mtx_unlock(&sc->sc_mtx); /* Drain is called after sc_mtx has been released */ callout_drain(&sc->sc_watchdog); } /* * if_ioctl handler of the lagg interface itself. * The requests that change lagg configuration (SIOCSLAGG, SIOCSLAGGOPTS, * SIOCSLAGGHASH, SIOCSLAGGPORT, SIOCSLAGGDELPORT) are gated by * priv_check(PRIV_NET_LAGG); commands not handled here are passed to * ether_ioctl(). Returns 0 or an errno. */ static int lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct epoch_tracker et; struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; /* data is reinterpreted according to cmd; only the view matching the command is meaningful */ struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0, oldmtu; bzero(&rpbuf, sizeof(rpbuf)); /* XXX: This can race with lagg_clone_destroy. 
*/ switch (cmd) { case SIOCGLAGG: /* Report the protocol and one lagg_reqport record per port, limited to what fits in the caller's ra_size bytes */ LAGG_XLOCK(sc); buflen = sc->sc_count * sizeof(struct lagg_reqport); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); ra->ra_proto = sc->sc_proto; lagg_proto_request(sc, &ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (len < sizeof(rpbuf)) break; lagg_port2req(lp, &rpbuf); memcpy(buf, &rpbuf, sizeof(rpbuf)); count++; buf += sizeof(rpbuf); len -= sizeof(rpbuf); } LAGG_XUNLOCK(sc); /* The records were staged in outbuf under the lock; the copyout happens after it is dropped */ ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); free(outbuf, M_TEMP); break; case SIOCSLAGG: /* Switch the aggregation protocol */ error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ra->ra_proto >= LAGG_PROTO_MAX) { error = EPROTONOSUPPORT; break; } /* Infiniband only supports the failover protocol. */ if (ra->ra_proto != LAGG_PROTO_FAILOVER && ifp->if_type == IFT_INFINIBAND) { error = EPROTONOSUPPORT; break; } LAGG_XLOCK(sc); lagg_proto_detach(sc); lagg_proto_attach(sc, ra->ra_proto); LAGG_XUNLOCK(sc); break; case SIOCGLAGGOPTS: /* Report sc_opts, with the LACP-only options merged in from the LACP softc when LACP is the current protocol */ LAGG_XLOCK(sc); ro->ro_opts = sc->sc_opts; if (sc->sc_proto == LAGG_PROTO_LACP) { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; if (lsc->lsc_debug.lsc_tx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_TXTEST; if (lsc->lsc_debug.lsc_rx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; if (lsc->lsc_fast_timeout != 0) ro->ro_opts |= LAGG_OPT_LACP_FAST_TIMO; ro->ro_active = sc->sc_active; } else { /* Other protocols: count the ports for which LAGG_PORTACTIVE() is true */ ro->ro_active = 0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } ro->ro_bkt = sc->sc_stride; ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; LAGG_XUNLOCK(sc); break; case SIOCSLAGGOPTS: error = priv_check(td, PRIV_NET_LAGG); if (error) break; /* * The stride option was added without defining a corresponding * LAGG_OPT flag, so handle a non-zero value before checking * anything else to 
preserve compatibility. */ LAGG_XLOCK(sc); if (ro->ro_opts == 0 && ro->ro_bkt != 0) { if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) { LAGG_XUNLOCK(sc); error = EINVAL; break; } sc->sc_stride = ro->ro_bkt; } /* Nothing else was requested */ if (ro->ro_opts == 0) { LAGG_XUNLOCK(sc); break; } /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. */ int valid, lacp; /* Exactly one option per request is accepted: a positive value sets it, the negated value clears it. valid says the value is recognised, lacp that it is an LACP-only option. */ switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_USE_NUMA: case -LAGG_OPT_USE_NUMA: case LAGG_OPT_FLOWIDSHIFT: case LAGG_OPT_RR_LIMIT: valid = 1; lacp = 0; break; case LAGG_OPT_LACP_TXTEST: case -LAGG_OPT_LACP_TXTEST: case LAGG_OPT_LACP_RXTEST: case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: case LAGG_OPT_LACP_FAST_TIMO: case -LAGG_OPT_LACP_FAST_TIMO: valid = lacp = 1; break; default: valid = lacp = 0; break; } if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. */ error = EINVAL; LAGG_XUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } /* * Store new options into sc->sc_opts except for * FLOWIDSHIFT, RR and LACP options. 
*/ if (lacp == 0) { if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) sc->flowid_shift = ro->ro_flowid_shift; else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) { if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN || ro->ro_bkt == 0) { error = EINVAL; LAGG_XUNLOCK(sc); break; } sc->sc_stride = ro->ro_bkt; } else if (ro->ro_opts > 0) sc->sc_opts |= ro->ro_opts; else sc->sc_opts &= ~ro->ro_opts; } else { struct lacp_softc *lsc; struct lacp_port *lp; lsc = (struct lacp_softc *)sc->sc_psc; switch (ro->ro_opts) { case LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 1; break; case -LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 0; break; case LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 1; break; case -LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 0; break; case LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 1; break; case -LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 0; break; case LAGG_OPT_LACP_FAST_TIMO: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state |= LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 1; break; case -LAGG_OPT_LACP_FAST_TIMO: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state &= ~LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 0; break; } } LAGG_XUNLOCK(sc); break; case SIOCGLAGGFLAGS: rf->rf_flags = 0; LAGG_XLOCK(sc); if (sc->sc_flags & MBUF_HASHFLAG_L2) rf->rf_flags |= LAGG_F_HASHL2; if (sc->sc_flags & MBUF_HASHFLAG_L3) rf->rf_flags |= LAGG_F_HASHL3; if (sc->sc_flags & MBUF_HASHFLAG_L4) rf->rf_flags |= LAGG_F_HASHL4; LAGG_XUNLOCK(sc); break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { error = EINVAL; break; } LAGG_XLOCK(sc); sc->sc_flags = 0; if (rf->rf_flags & LAGG_F_HASHL2) sc->sc_flags |= MBUF_HASHFLAG_L2; if (rf->rf_flags & LAGG_F_HASHL3) sc->sc_flags |= MBUF_HASHFLAG_L3; if (rf->rf_flags & LAGG_F_HASHL4) sc->sc_flags |= MBUF_HASHFLAG_L4; LAGG_XUNLOCK(sc); break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == 
'\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } /* tpif is referenced from here on; every exit path below releases it with if_rele() */ NET_EPOCH_ENTER(et); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; NET_EPOCH_EXIT(et); if_rele(tpif); break; } lagg_port2req(lp, rp); NET_EPOCH_EXIT(et); if_rele(tpif); break; case SIOCSLAGGPORT: /* Add the interface named in rp_portname as a port */ error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } #ifdef INET6 /* * A laggport interface should not have inet6 address * because two interfaces with a valid link-local * scope zone must not be merged in any form. This * restriction is needed to prevent violation of * link-local scope zone. Attempts to add a laggport * interface which has inet6 addresses triggers * removal of all inet6 addresses on the member * interface. */ if (in6ifa_llaonifp(tpif)) { in6_ifdetach(tpif); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", tpif->if_xname); } #endif /* Remember the lagg MTU so a change made by lagg_port_create() can be detected afterwards */ oldmtu = ifp->if_mtu; LAGG_XLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_XUNLOCK(sc); if_rele(tpif); /* * LAGG MTU may change during addition of the first port. * If it did, do network layer specific procedure. 
*/ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } VLAN_CAPABILITIES(ifp); break; case SIOCSLAGGDELPORT: /* Remove the interface named in rp_portname from this lagg */ error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_XLOCK(sc); /* The interface must currently be a port of this very lagg */ if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_XUNLOCK(sc); if_rele(tpif); break; } error = lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); if_rele(tpif); VLAN_CAPABILITIES(ifp); break; case SIOCSIFFLAGS: /* Set flags on ports too */ LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setflags(lp, 1); } if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ lagg_stop(sc); LAGG_XUNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. 
*/ /* The lock is dropped before if_init is called; lagg_init() takes it itself */ LAGG_XUNLOCK(sc); (*ifp->if_init)(sc); } else LAGG_XUNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: /* Resynchronise every port by clearing and re-adding its multicast state */ LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_clrmulti(lp); lagg_setmulti(lp); } LAGG_XUNLOCK(sc); error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: /* Media requests are rejected with EINVAL on InfiniBand laggs */ if (ifp->if_type == IFT_INFINIBAND) error = EINVAL; else error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: case SIOCSIFCAPNV: /* Forward the request to every port (per-port errors are ignored), then recompute the lagg's capabilities */ LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(ifp); error = 0; break; case SIOCGIFCAPNV: error = 0; break; case SIOCSIFMTU: /* Apply the new MTU to the ports one by one; the first failure aborts the walk and triggers the rollback below */ LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); else error = EINVAL; if (error != 0) { if_printf(ifp, "failed to change MTU to %d on port %s, " "reverting all ports to original MTU (%d)\n", ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu); break; } } if (error == 0) { ifp->if_mtu = ifr->ifr_mtu; } else { /* set every port back to the original MTU; note that this overwrites ifr_mtu in the caller's request */ ifr->ifr_mtu = ifp->if_mtu; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } } lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(ifp); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } #if defined(KERN_TLS) || defined(RATELIMIT) /* * Send-tag method tables. Each variant forwards to the same * lagg_snd_tag_*() wrappers and differs only in .type. */ #ifdef RATELIMIT static const struct if_snd_tag_sw lagg_snd_tag_ul_sw = { .snd_tag_modify = lagg_snd_tag_modify, .snd_tag_query = lagg_snd_tag_query, .snd_tag_free = lagg_snd_tag_free, .next_snd_tag = lagg_next_snd_tag, .type = IF_SND_TAG_TYPE_UNLIMITED }; static const struct if_snd_tag_sw lagg_snd_tag_rl_sw = { .snd_tag_modify = lagg_snd_tag_modify, .snd_tag_query = lagg_snd_tag_query, .snd_tag_free = lagg_snd_tag_free, .next_snd_tag = lagg_next_snd_tag, .type = IF_SND_TAG_TYPE_RATE_LIMIT }; 
#endif

#ifdef KERN_TLS
/*
 * Send tag method table used for lagg wrapper tags of type
 * IF_SND_TAG_TYPE_TLS.  All hooks point at the lagg_snd_tag_*()
 * wrappers defined further down.
 */
static const struct if_snd_tag_sw lagg_snd_tag_tls_sw = {
	.snd_tag_modify = lagg_snd_tag_modify,
	.snd_tag_query = lagg_snd_tag_query,
	.snd_tag_free = lagg_snd_tag_free,
	.next_snd_tag = lagg_next_snd_tag,
	.type = IF_SND_TAG_TYPE_TLS
};

#ifdef RATELIMIT
/*
 * Same method table, but advertising IF_SND_TAG_TYPE_TLS_RATE_LIMIT.
 */
static const struct if_snd_tag_sw lagg_snd_tag_tls_rl_sw = {
	.snd_tag_modify = lagg_snd_tag_modify,
	.snd_tag_query = lagg_snd_tag_query,
	.snd_tag_free = lagg_snd_tag_free,
	.next_snd_tag = lagg_next_snd_tag,
	.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT
};
#endif
#endif

/*
 * Convert a pointer to the "com" member embedded in a struct
 * lagg_snd_tag back into a pointer to the enclosing lagg_snd_tag.
 */
static inline struct lagg_snd_tag *
mst_to_lst(struct m_snd_tag *mst)
{

	return (__containerof(mst, struct lagg_snd_tag, com));
}

/*
 * Look up the port used by a specific flow.  This only works for lagg
 * protocols with deterministic port mappings (e.g. not roundrobin).
 * In addition protocols which use a hash to map flows to ports must
 * be configured to use the mbuf flowid rather than hashing packet
 * contents.
 *
 * Returns NULL for any other protocol, when LAGG_OPT_USE_FLOWID is not
 * set, or when flowtype is M_HASHTYPE_NONE (loadbalance and LACP only).
 */
static struct lagg_port *
lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype,
    uint8_t numa_domain)
{
	struct lagg_softc *sc;
	struct lagg_port *lp;
	struct lagg_lb *lb;
	uint32_t hash, p;
	int err;

	sc = ifp->if_softc;

	switch (sc->sc_proto) {
	case LAGG_PROTO_FAILOVER:
		return (lagg_link_active(sc, sc->sc_primary));
	case LAGG_PROTO_LOADBALANCE:
		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
		    flowtype == M_HASHTYPE_NONE)
			return (NULL);
		/* Same index computation as the flowid path of lagg_lb_start(). */
		p = flowid >> sc->flowid_shift;
		p %= sc->sc_count;
		lb = (struct lagg_lb *)sc->sc_psc;
		lp = lb->lb_ports[p];
		return (lagg_link_active(sc, lp));
	case LAGG_PROTO_LACP:
		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
		    flowtype == M_HASHTYPE_NONE)
			return (NULL);
		hash = flowid >> sc->flowid_shift;
		/* The error reported through "err" is not used here. */
		return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, &err));
	default:
		return (NULL);
	}
}

/*
 * Allocate a send tag for a flow on the lagg interface "ifp".
 *
 * The tag type in params->hdr.type selects the method table for the
 * lagg wrapper tag; unsupported types yield EOPNOTSUPP.  For
 * IF_SND_TAG_TYPE_TLS_RX no wrapper is created and the port's own tag
 * is handed back.  The port is found with lookup_snd_tag_port() using
 * the flowid, flowtype and numa_domain from params->hdr.
 *
 * On success 0 is returned and *ppmt points either at the wrapper's
 * "com" member or at the port tag.  Errors: EOPNOTSUPP (unsupported
 * type, no port, or port without an ifnet), ENOMEM (wrapper allocation
 * failed), or whatever m_snd_tag_alloc() returned.
 */
static int
lagg_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct epoch_tracker et;
	const struct if_snd_tag_sw *sw;
	struct lagg_snd_tag *lst;
	struct lagg_port *lp;
	struct ifnet *lp_ifp;
	struct m_snd_tag *mst;
	int error;

	switch (params->hdr.type) {
#ifdef RATELIMIT
	case IF_SND_TAG_TYPE_UNLIMITED:
		sw = &lagg_snd_tag_ul_sw;
		break;
	case IF_SND_TAG_TYPE_RATE_LIMIT:
		sw = &lagg_snd_tag_rl_sw;
		break;
#endif
#ifdef KERN_TLS
	case IF_SND_TAG_TYPE_TLS:
		sw = &lagg_snd_tag_tls_sw;
		break;
	case IF_SND_TAG_TYPE_TLS_RX:
		/* Return tag from port interface directly. */
		sw = NULL;
		break;
#ifdef RATELIMIT
	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
		sw = &lagg_snd_tag_tls_rl_sw;
		break;
#endif
#endif
	default:
		return (EOPNOTSUPP);
	}

	/*
	 * The port lookup runs inside the network epoch; a reference on
	 * the port's ifnet is taken before the epoch is left so that
	 * lp_ifp can still be used afterwards.
	 */
	NET_EPOCH_ENTER(et);
	lp = lookup_snd_tag_port(ifp, params->hdr.flowid,
	    params->hdr.flowtype, params->hdr.numa_domain);
	if (lp == NULL) {
		NET_EPOCH_EXIT(et);
		return (EOPNOTSUPP);
	}
	if (lp->lp_ifp == NULL) {
		NET_EPOCH_EXIT(et);
		return (EOPNOTSUPP);
	}
	lp_ifp = lp->lp_ifp;
	if_ref(lp_ifp);
	NET_EPOCH_EXIT(et);

	/* A wrapper is only needed when a lagg method table was chosen. */
	if (sw != NULL) {
		lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT);
		if (lst == NULL) {
			if_rele(lp_ifp);
			return (ENOMEM);
		}
	} else
		lst = NULL;

	error = m_snd_tag_alloc(lp_ifp, params, &mst);
	if_rele(lp_ifp);
	if (error) {
		/* lst may be NULL here (TLS_RX case). */
		free(lst, M_LAGG);
		return (error);
	}

	if (sw != NULL) {
		m_snd_tag_init(&lst->com, ifp, sw);
		lst->tag = mst;

		*ppmt = &lst->com;
	} else
		*ppmt = mst;

	return (0);
}

/*
 * next_snd_tag hook: return the port tag stored in the lagg wrapper.
 */
static struct m_snd_tag *
lagg_next_snd_tag(struct m_snd_tag *mst)
{
	struct lagg_snd_tag *lst;

	lst = mst_to_lst(mst);
	return (lst->tag);
}

/*
 * snd_tag_modify hook: forward the request to the snd_tag_modify
 * method of the wrapped port tag and return its result.
 */
static int
lagg_snd_tag_modify(struct m_snd_tag *mst,
    union if_snd_tag_modify_params *params)
{
	struct lagg_snd_tag *lst;

	lst = mst_to_lst(mst);
	return (lst->tag->sw->snd_tag_modify(lst->tag, params));
}

/*
 * snd_tag_query hook: forward the request to the snd_tag_query
 * method of the wrapped port tag and return its result.
 */
static int
lagg_snd_tag_query(struct m_snd_tag *mst,
    union if_snd_tag_query_params *params)
{
	struct lagg_snd_tag *lst;

	lst = mst_to_lst(mst);
	return (lst->tag->sw->snd_tag_query(lst->tag, params));
}

/*
 * snd_tag_free hook: drop the reference held on the wrapped port tag
 * and free the wrapper itself.
 */
static void
lagg_snd_tag_free(struct m_snd_tag *mst)
{
	struct lagg_snd_tag *lst;

	lst = mst_to_lst(mst);
	m_snd_tag_rele(lst->tag);
	free(lst, M_LAGG);
}

/*
 * Fill in the rate limit query results for the lagg interface: no rate
 * table, zero flows and rates, and the RT_IS_INDIRECT flag.
 */
static void
lagg_ratelimit_query(struct ifnet *ifp __unused,
    struct if_ratelimit_query_results *q)
{
	/*
	 * For lagg, we have an indirect
	 * interface. The caller needs to
	 * get a ratelimit tag on the actual
	 * interface the flow will go on.
	 */
	q->rate_table = NULL;
	q->flags = RT_IS_INDIRECT;
	q->max_flows = 0;
	q->number_of_rates = 0;
}
#endif

/*
 * Mirror the link-layer multicast addresses of the lagg interface onto
 * port "lp".
 *
 * First, with the lagg interface's address list write-locked, one
 * lagg_mc record is queued on lp->lp_mc_head for every AF_LINK entry.
 * Then, with the lock dropped, if_addmulti() is called on the port for
 * every record on that list.
 *
 * Returns 0, ENOMEM if a record could not be allocated, or the first
 * error from if_addmulti().  Records queued before a failure are left
 * on lp->lp_mc_head.
 */
static int
lagg_setmulti(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct ifnet *scifp = sc->sc_ifp;
	struct lagg_mc *mc;
	struct ifmultiaddr *ifma;
	int error;

	IF_ADDR_WLOCK(scifp);
	CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* M_NOWAIT: the allocation happens with the address lock held. */
		mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT);
		if (mc == NULL) {
			IF_ADDR_WUNLOCK(scifp);
			return (ENOMEM);
		}
		bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len);
		/* Re-target the copied address at the port's interface index. */
		mc->mc_addr.sdl_index = ifp->if_index;
		mc->mc_ifma = NULL;
		SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
	}
	IF_ADDR_WUNLOCK(scifp);
	SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) {
		error = if_addmulti(ifp,
		    (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma);
		if (error)
			return (error);
	}
	return (0);
}

/*
 * Remove and free every lagg_mc record queued on port "lp".  The
 * membership itself is deleted with if_delmulti_ifma() only when the
 * record has one and the port is not detaching.  Always returns 0.
 */
static int
lagg_clrmulti(struct lagg_port *lp)
{
	struct lagg_mc *mc;

	LAGG_XLOCK_ASSERT(lp->lp_softc);
	while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
		SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
		if (mc->mc_ifma && lp->lp_detaching == 0)
			if_delmulti_ifma(mc->mc_ifma);
		free(mc, M_LAGG);
	}
	return (0);
}

/*
 * Ask port "lp" to enable the capability sets "cap" / "cap2" through
 * its saved ioctl handler.  Nothing is done when the port already has
 * exactly these sets enabled or when no handler was saved.  The result
 * of the ioctl is not checked.
 */
static void
lagg_setcaps(struct lagg_port *lp, int cap, int cap2)
{
	struct ifreq ifr;
	struct siocsifcapnv_driver_data drv_ioctl_data;

	if (lp->lp_ifp->if_capenable == cap &&
	    lp->lp_ifp->if_capenable2 == cap2)
		return;
	if (lp->lp_ioctl == NULL)
		return;
	/* XXX */
	if ((lp->lp_ifp->if_capabilities & IFCAP_NV) != 0) {
		/* Ports advertising IFCAP_NV get both sets via SIOCSIFCAPNV. */
		drv_ioctl_data.reqcap = cap;
		drv_ioctl_data.reqcap2 = cap2;
		drv_ioctl_data.nvcap = NULL;
		(*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAPNV,
		    (caddr_t)&drv_ioctl_data);
	} else {
		/* Only "cap" is passed on this path; only ifr_reqcap is set. */
		ifr.ifr_reqcap = cap;
		(*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr);
	}
}
/* Handle a ref counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct ifnet *ifp = lp->lp_ifp; int error; LAGG_XLOCK_ASSERT(sc); status = status ? (scifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded ports status is different from what * we want it to be. If it is, flip it. We record ports * status in lp_ifflags so that we won't clear ports flag * we haven't set. In fact, we don't clear or set ports * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual ports flags. */ if (status != (lp->lp_ifflags & flag)) { error = (*func)(ifp, status); if (error) return (error); lp->lp_ifflags &= ~flag; lp->lp_ifflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the lagg port * if "status" is true, update ports flags respective to the lagg * if "status" is false, forcedly clear the flags set on port. 
*/ static int lagg_setflags(struct lagg_port *lp, int status) { int error, i; for (i = 0; lagg_pflags[i].flag; i++) { error = lagg_setflag(lp, lagg_pflags[i].flag, status, lagg_pflags[i].func); if (error) return (error); } return (0); } static int lagg_transmit_ethernet(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) MPASS(m->m_pkthdr.snd_tag->ifp == ifp); #endif NET_EPOCH_ENTER(et); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { NET_EPOCH_EXIT(et); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } ETHER_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); NET_EPOCH_EXIT(et); return (error); } static int lagg_transmit_infiniband(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) MPASS(m->m_pkthdr.snd_tag->ifp == ifp); #endif NET_EPOCH_ENTER(et); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { NET_EPOCH_EXIT(et); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } INFINIBAND_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); NET_EPOCH_EXIT(et); return (error); } /* * The ifp->if_qflush entry point for lagg(4) is no-op. 
*/
static void
lagg_qflush(struct ifnet *ifp __unused)
{
}

/*
 * Input hook for Ethernet frames received on a lagg port ("ifp" is the
 * interface whose if_lagg points at the port).
 *
 * The mbuf is freed and NULL returned when the lagg interface is not
 * running, the port is detaching, or no protocol is configured.
 * Otherwise the frame is tapped on the lagg interface's BPF and passed
 * to lagg_proto_input(); a frame that survives is still dropped when the
 * lagg interface has IFF_MONITOR set.
 */
static struct mbuf *
lagg_input_ethernet(struct ifnet *ifp, struct mbuf *m)
{
	struct epoch_tracker et;
	struct lagg_port *lp = ifp->if_lagg;
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;

	NET_EPOCH_ENTER(et);
	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    lp->lp_detaching != 0 ||
	    sc->sc_proto == LAGG_PROTO_NONE) {
		NET_EPOCH_EXIT(et);
		m_freem(m);
		return (NULL);
	}

	ETHER_BPF_MTAP(scifp, m);

	m = lagg_proto_input(sc, lp, m);
	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
		m_freem(m);
		m = NULL;
	}

	NET_EPOCH_EXIT(et);
	return (m);
}

/*
 * InfiniBand counterpart of lagg_input_ethernet(); the only difference
 * is the BPF tap macro used.
 */
static struct mbuf *
lagg_input_infiniband(struct ifnet *ifp, struct mbuf *m)
{
	struct epoch_tracker et;
	struct lagg_port *lp = ifp->if_lagg;
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;

	NET_EPOCH_ENTER(et);
	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    lp->lp_detaching != 0 ||
	    sc->sc_proto == LAGG_PROTO_NONE) {
		NET_EPOCH_EXIT(et);
		m_freem(m);
		return (NULL);
	}

	INFINIBAND_BPF_MTAP(scifp, m);

	m = lagg_proto_input(sc, lp, m);
	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
		m_freem(m);
		m = NULL;
	}

	NET_EPOCH_EXIT(et);
	return (m);
}

/*
 * Media change callback.  The request is ignored; the function name is
 * printed when IFF_DEBUG is set in sc_ifflags.  Always returns 0.
 */
static int
lagg_media_change(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;

	if (sc->sc_ifflags & IFF_DEBUG)
		printf("%s\n", __func__);

	/* Ignore */
	return (0);
}

/*
 * Media status callback: report IFM_ETHER | IFM_AUTO as the active
 * media and set IFM_ACTIVE when at least one port satisfies
 * LAGG_PORTACTIVE().
 */
static void
lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct epoch_tracker et;
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	imr->ifm_status = IFM_AVALID;
	imr->ifm_active = IFM_ETHER | IFM_AUTO;

	NET_EPOCH_ENTER(et);
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp))
			imr->ifm_status |= IFM_ACTIVE;
	}
	NET_EPOCH_EXIT(et);
}

/*
 * Recompute the link state and if_baudrate of the lagg interface from
 * its ports.  Does nothing for LACP.  The caller must hold the lagg
 * exclusive lock (asserted).
 */
static void
lagg_linkstate(struct lagg_softc *sc)
{
	struct epoch_tracker et;
	struct lagg_port *lp;
	int new_link = LINK_STATE_DOWN;
	uint64_t speed;

	LAGG_XLOCK_ASSERT(sc);

	/* LACP handles link state itself */
	if (sc->sc_proto == LAGG_PROTO_LACP)
		return;

	/* Our link is considered up if at least one of our ports is active */
	NET_EPOCH_ENTER(et);
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
			new_link = LINK_STATE_UP;
			break;
		}
	}
	NET_EPOCH_EXIT(et);
	if_link_state_change(sc->sc_ifp, new_link);

	/* Update if_baudrate to reflect the max possible speed */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			/* Failover: baudrate of the primary port, or 0 without one. */
			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
			    sc->sc_primary->lp_ifp->if_baudrate : 0;
			break;
		case LAGG_PROTO_ROUNDROBIN:
		case LAGG_PROTO_LOADBALANCE:
		case LAGG_PROTO_BROADCAST:
			/* Sum of the baudrates of all ports. */
			speed = 0;
			NET_EPOCH_ENTER(et);
			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
				speed += lp->lp_ifp->if_baudrate;
			NET_EPOCH_EXIT(et);
			sc->sc_ifp->if_baudrate = speed;
			break;
		case LAGG_PROTO_LACP:
			/* LACP updates if_baudrate itself */
			break;
	}
}

/*
 * Link state notification for interface "ifp".  When the interface is
 * attached to a lagg (if_lagg and lp_softc are both set), the lagg link
 * state is recomputed and the protocol's linkstate handler is called,
 * both under the lagg exclusive lock.  The "state" argument is not used.
 */
static void
lagg_port_state(struct ifnet *ifp, int state)
{
	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
	struct lagg_softc *sc = NULL;

	if (lp != NULL)
		sc = lp->lp_softc;
	if (sc == NULL)
		return;

	LAGG_XLOCK(sc);
	lagg_linkstate(sc);
	lagg_proto_linkstate(sc, lp);
	LAGG_XUNLOCK(sc);
}

/*
 * Pick an active port, preferring "lp".
 *
 * Order of preference: "lp" itself if active, then the port following
 * "lp" in the port list if active, then the first active port found by
 * scanning the list from its head.  A NULL "lp" goes straight to the
 * scan.  Returns NULL when no port is active.
 */
struct lagg_port *
lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_port *lp_next, *rval = NULL;

	/*
	 * Search a port which reports an active link state.
	 */

#ifdef INVARIANTS
	/*
	 * This is called with either in the network epoch
	 * or with LAGG_XLOCK(sc) held.
	 */
	if (!in_epoch(net_epoch_preempt))
		LAGG_XLOCK_ASSERT(sc);
#endif

	if (lp == NULL)
		goto search;
	if (LAGG_PORTACTIVE(lp)) {
		rval = lp;
		goto found;
	}
	if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL &&
	    LAGG_PORTACTIVE(lp_next)) {
		rval = lp_next;
		goto found;
	}

search:
	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp_next)) {
			return (lp_next);
		}
	}
found:
	/* rval is still NULL when the scan above found nothing. */
	return (rval);
}

/*
 * Hand an mbuf to the if_transmit routine of port interface "ifp".
 *
 * When the mbuf carries a send tag (CSUM_SND_TAG), that tag is treated
 * as a lagg wrapper: if the wrapped tag belongs to a different
 * interface the mbuf is freed and EAGAIN returned; otherwise the mbuf
 * is switched over to a new reference on the wrapped tag and the
 * reference on the wrapper is released.
 */
int
lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
{

#if defined(KERN_TLS) || defined(RATELIMIT)
	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
		struct lagg_snd_tag *lst;
		struct m_snd_tag *mst;

		mst = m->m_pkthdr.snd_tag;
		lst = mst_to_lst(mst);
		if (lst->tag->ifp != ifp) {
			m_freem(m);
			return (EAGAIN);
		}
		m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag);
		m_snd_tag_rele(mst);
	}
#endif
	return (ifp->if_transmit)(ifp, m);
}

/*
 * Simple round robin aggregation
 */
/* Reset the round robin sequence counter and use a stride of 1. */
static void
lagg_rr_attach(struct lagg_softc *sc)
{
	sc->sc_seq = 0;
	sc->sc_stride = 1;
}

/*
 * Round robin transmit: the port index is the atomically incremented
 * sequence counter divided by sc_stride, modulo sc_count.  Returns
 * ENETDOWN (and frees the mbuf) when no active port can be found.
 */
static int
lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;
	uint32_t p;

	p = atomic_fetchadd_32(&sc->sc_seq, 1);
	p /= sc->sc_stride;
	p %= sc->sc_count;
	/* Walk p entries into the port list. */
	lp = CK_SLIST_FIRST(&sc->sc_ports);

	while (p--)
		lp = CK_SLIST_NEXT(lp, lp_entries);

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

/* Round robin input: accept the packet on behalf of the lagg interface. */
static struct mbuf *
lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * Broadcast mode
 */
/*
 * Broadcast transmit: every active port except the last one seen gets
 * a copy made with m_copym(); the original mbuf goes to the last one.
 *
 * A failed copy counts as one error and ends the loop early; the
 * ENOBUFS stored in "ret" at that point is never returned, because
 * every later return path either uses a constant or reassigns "ret".
 * Returns ENOENT when no port was active, ENETDOWN when
 * lagg_link_active() finds no port for the final send, otherwise the
 * result of the final lagg_enqueue().  Results of the enqueues of the
 * copies are ignored.
 */
static int
lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m)
{
	int errors = 0;
	int ret;
	struct lagg_port *lp, *last = NULL;
	struct mbuf *m0;

	NET_EPOCH_ASSERT();
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (!LAGG_PORTACTIVE(lp))
			continue;

		if (last != NULL) {
			m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
			if (m0 == NULL) {
				ret = ENOBUFS;
				errors++;
				break;
			}
			lagg_enqueue(last->lp_ifp, m0);
		}
		last = lp;
	}

	if (last == NULL) {
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
		m_freem(m);
		return (ENOENT);
	}
	if ((last = lagg_link_active(sc, last)) == NULL) {
		errors++;
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
		m_freem(m);
		return (ENETDOWN);
	}

	ret = lagg_enqueue(last->lp_ifp, m);
	if (errors != 0)
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);

	return (ret);
}

/* Broadcast input: accept the packet on behalf of the lagg interface. */
static struct mbuf*
lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;
	return (m);
}

/*
 * Active failover
 */
/*
 * Failover transmit: send on the port lagg_link_active() selects when
 * started from the primary.  Returns ENETDOWN (and frees the mbuf) when
 * there is none.
 */
static int
lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	/* Use the master port if active or the next available port */
	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

/*
 * Failover input.  A packet is accepted when it arrived on the primary
 * port or V_lagg_failover_rx_all is set.  Otherwise it is accepted only
 * while the primary is not active and either the receiving port is the
 * one lagg_link_active() selects or no port is active at all.
 * Everything else is freed and NULL returned.
 */
static struct mbuf *
lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct lagg_port *tmp_tp;

	if (lp == sc->sc_primary || V_lagg_failover_rx_all) {
		m->m_pkthdr.rcvif = ifp;
		return (m);
	}

	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
		tmp_tp = lagg_link_active(sc, sc->sc_primary);
		/*
		 * If tmp_tp is null, we've received a packet when all
		 * our links are down. Weird, but process it anyways.
		 */
		if ((tmp_tp == NULL || tmp_tp == lp)) {
			m->m_pkthdr.rcvif = ifp;
			return (m);
		}
	}

	m_freem(m);
	return (NULL);
}

/*
 * Loadbalancing
 */
/*
 * Allocate the zeroed loadbalance state (M_WAITOK), initialise its hash
 * key, publish it in sc_psc and run lagg_lb_port_create() for every
 * existing port.
 */
static void
lagg_lb_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	struct lagg_lb *lb;

	LAGG_XLOCK_ASSERT(sc);
	lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO);
	lb->lb_key = m_ether_tcpip_hash_init();
	sc->sc_psc = lb;

	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_lb_port_create(lp);
}

/* Free the loadbalance state; sc_psc itself is not cleared here. */
static void
lagg_lb_detach(struct lagg_softc *sc)
{
	struct lagg_lb *lb;

	lb = (struct lagg_lb *)sc->sc_psc;
	if (lb != NULL)
		free(lb, M_LAGG);
}

/*
 * Rebuild the lb_ports[] table from the current port list, leaving out
 * port "lp" (pass NULL to include every port).  The table is zeroed
 * first.  Returns EINVAL once more than LAGG_MAX_PORTS ports would have
 * to be stored; the entries filled in up to that point are kept.
 */
static int
lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp_next;
	int i = 0, rv;

	rv = 0;
	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
	LAGG_XLOCK_ASSERT(sc);
	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (lp_next == lp)
			continue;
		if (i >= LAGG_MAX_PORTS) {
			rv = EINVAL;
			break;
		}
		if (sc->sc_ifflags & IFF_DEBUG)
			printf("%s: port %s at index %d\n",
			    sc->sc_ifname, lp_next->lp_ifp->if_xname, i);
		lb->lb_ports[i++] = lp_next;
	}

	return (rv);
}

/* Port added: rebuild the table with every port on the list. */
static int
lagg_lb_port_create(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	return (lagg_lb_porttable(sc, NULL));
}

/* Port removed: rebuild the table without "lp"; the result is ignored. */
static void
lagg_lb_port_destroy(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	lagg_lb_porttable(sc, lp);
}

/*
 * Loadbalance transmit.  The table index comes from the mbuf flowid
 * (shifted by flowid_shift) when LAGG_OPT_USE_FLOWID is set and the
 * mbuf has a hash type, otherwise from m_ether_tcpip_hash(); either
 * value is reduced modulo sc_count.  Returns ENETDOWN (and frees the
 * mbuf) when no active port can be found.
 */
static int
lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp = NULL;
	uint32_t p = 0;

	if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
	    M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
		p = m->m_pkthdr.flowid >> sc->flowid_shift;
	else
		p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key);
	p %= sc->sc_count;
	lp = lb->lb_ports[p];

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

/* Loadbalance input: accept the packet on behalf of the lagg interface. */
static struct mbuf *
lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * 802.3ad LACP
 */
/* Attach LACP to the lagg and create LACP state for every existing port. */
static void
lagg_lacp_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	lacp_attach(sc);
	LAGG_XLOCK_ASSERT(sc);
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);
}

/*
 * Destroy the LACP state of every port, then detach LACP.  sc_psc is
 * cleared before lacp_detach() is called with its previous value.
 */
static void
lagg_lacp_detach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	void *psc;

	LAGG_XLOCK_ASSERT(sc);
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	psc = sc->sc_psc;
	sc->sc_psc = NULL;
	lacp_detach(psc);
}

/*
 * Link-layer address handler for LACP: destroy and re-create the LACP
 * state of every port.
 */
static void
lagg_lacp_lladdr(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	LAGG_SXLOCK_ASSERT(sc);

	/* purge all the lacp ports */
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* add them back in */
	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);
}

/*
 * LACP transmit: send on the port chosen by lacp_select_tx_port().
 * When it returns no port the mbuf is freed and the error it reported
 * through "err" is returned.
 */
static int
lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;
	int err;

	lp = lacp_select_tx_port(sc, m, &err);
	if (lp == NULL) {
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
		m_freem(m);
		return (err);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

/*
 * LACP input.  Frames with ethertype ETHERTYPE_SLOW and no M_VLANTAG are
 * first given to lacp_input(), which may consume them (NULL return).
 * Remaining frames are accepted for the lagg interface only when the
 * port is both collecting and active according to lacp_iscollecting()
 * and lacp_isactive(); otherwise they are freed.
 */
static struct mbuf *
lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct ether_header *eh;
	u_short etype;

	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);

	/* Tap off LACP control messages */
	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
		m = lacp_input(lp, m);
		if (m == NULL)
			return (NULL);
	}

	/*
	 * If the port is not collecting or not in the active aggregator then
	 * free and return.
	 */
	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
		m_freem(m);
		return (NULL);
	}

	m->m_pkthdr.rcvif = ifp;
	return (m);
}
diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c
index 4f0b236f7f5e..f4d34c46f9f0 100644
--- a/sys/net/if_loop.c
+++ b/sys/net/if_loop.c
@@ -1,454 +1,460 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ /* * Loopback interface driver for protocol testing and timing. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef INET6 #ifndef INET #include #endif #include #include #endif #include #ifdef TINY_LOMTU #define LOMTU (1024+512) #elif defined(LARGE_LOMTU) #define LOMTU 131072 #else #define LOMTU 16384 #endif #define LO_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP) #define LO_CSUM_FEATURES6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6) #define LO_CSUM_SET (CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \ CSUM_PSEUDO_HDR | \ CSUM_IP_CHECKED | CSUM_IP_VALID | \ CSUM_SCTP_VALID) static int loioctl(struct ifnet *, u_long, caddr_t); static int looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); -static int lo_clone_create(struct if_clone *, int, caddr_t); -static void lo_clone_destroy(struct ifnet *); VNET_DEFINE(struct ifnet *, loif); /* Used externally */ #ifdef VIMAGE VNET_DEFINE_STATIC(struct if_clone *, lo_cloner); #define V_lo_cloner VNET(lo_cloner) #endif static struct if_clone *lo_cloner; static const char loname[] = "lo"; -static void -lo_clone_destroy(struct ifnet *ifp) +static int +lo_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { + if (ifp->if_dunit == 0 && (flags & IFC_F_FORCE) == 0) + return (EINVAL); #ifndef VIMAGE /* XXX: destroying lo0 will lead to panics. 
*/ KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); #endif bpfdetach(ifp); if_detach(ifp); if_free(ifp); + + return (0); } static int -lo_clone_create(struct if_clone *ifc, int unit, caddr_t params) +lo_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { struct ifnet *ifp; ifp = if_alloc(IFT_LOOP); if (ifp == NULL) return (ENOSPC); - if_initname(ifp, loname, unit); + if_initname(ifp, loname, ifd->unit); ifp->if_mtu = LOMTU; ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_ioctl = loioctl; ifp->if_output = looutput; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_capabilities = ifp->if_capenable = IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 | IFCAP_LINKSTATE; ifp->if_hwassist = LO_CSUM_FEATURES | LO_CSUM_FEATURES6; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); if (V_loif == NULL) V_loif = ifp; + *ifpp = ifp; return (0); } static void vnet_loif_init(const void *unused __unused) { - + struct if_clone_addreq req = { + .create_f = lo_clone_create, + .destroy_f = lo_clone_destroy, + .flags = IFC_F_AUTOUNIT, + }; + lo_cloner = ifc_attach_cloner(loname, &req); #ifdef VIMAGE - lo_cloner = if_clone_simple(loname, lo_clone_create, lo_clone_destroy, - 1); V_lo_cloner = lo_cloner; -#else - lo_cloner = if_clone_simple(loname, lo_clone_create, lo_clone_destroy, - 1); #endif + struct ifc_data ifd = { .unit = 0 }; + ifc_create_ifp(loname, &ifd, NULL); } VNET_SYSINIT(vnet_loif_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_loif_init, NULL); #ifdef VIMAGE static void vnet_loif_uninit(const void *unused __unused) { - if_clone_detach(V_lo_cloner); + ifc_detach_cloner(V_lo_cloner); V_loif = NULL; } VNET_SYSUNINIT(vnet_loif_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_loif_uninit, NULL); #endif static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); return (EINVAL); default: return (EOPNOTSUPP); } 
return (0); } static moduledata_t loop_mod = { "if_lo", loop_modevent, 0 }; DECLARE_MODULE(if_lo, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); static int looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int32_t af; #ifdef MAC int error; #endif M_ASSERTPKTHDR(m); /* check if we have the packet header */ #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); return (error); } #endif if (ro != NULL && ro->ro_flags & (RT_REJECT|RT_BLACKHOLE)) { m_freem(m); return (ro->ro_flags & RT_BLACKHOLE ? 0 : EHOSTUNREACH); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); #ifdef RSS M_HASHTYPE_CLEAR(m); #endif /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); #if 1 /* XXX */ switch (af) { case AF_INET: if (ifp->if_capenable & IFCAP_RXCSUM) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; } m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES; break; case AF_INET6: #if 0 /* * XXX-BZ for now always claim the checksum is good despite * any interface flags. This is a workaround for 9.1-R and * a proper solution ought to be sought later. */ if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) { m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; } #else m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = LO_CSUM_SET; #endif m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES6; break; default: printf("looutput: af=%d unexpected\n", af); m_freem(m); return (EAFNOSUPPORT); } #endif return (if_simloop(ifp, m, af, 0)); } /* * if_simloop() * * This function is to support software emulation of hardware loopback, * i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't * hear their own broadcasts, we create a copy of the packet that we * would normally receive via a hardware loopback. 
* * This function expects the packet to include the media header of length hlen. */ int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) { int isr; M_ASSERTPKTHDR(m); m_tag_delete_nonpersistent(m); m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Let BPF see incoming packet in the following manner: * - Emulated packet loopback for a simplex interface * (net/if_ethersubr.c) * -> passes it to ifp's BPF * - IPv4/v6 multicast packet loopback (netinet(6)/ip(6)_output.c) * -> not passes it to any BPF * - Normal packet loopback from myself to myself (net/if_loop.c) * -> passes to lo0's BPF (even in case of IPv6, where ifp!=lo0) */ if (hlen > 0) { if (bpf_peers_present(ifp->if_bpf)) { bpf_mtap(ifp->if_bpf, m); } } else { if (bpf_peers_present(V_loif->if_bpf)) { if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) { /* XXX beware sizeof(af) != 4 */ u_int32_t af1 = af; /* * We need to prepend the address family. */ bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m); } } } /* Strip away media header */ if (hlen > 0) { m_adj(m, hlen); #ifndef __NO_STRICT_ALIGNMENT /* * Some archs do not like unaligned data, so * we move data down in the first mbuf. */ if (mtod(m, vm_offset_t) & 3) { KASSERT(hlen >= 3, ("if_simloop: hlen too small")); bcopy(m->m_data, (char *)(mtod(m, vm_offset_t) - (mtod(m, vm_offset_t) & 3)), m->m_len); m->m_data -= (mtod(m,vm_offset_t) & 3); } #endif } /* Deliver to upper layer protocol */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: m->m_flags |= M_LOOP; isr = NETISR_IPV6; break; #endif default: printf("if_simloop: can't handle af=%d\n", af); m_freem(m); return (EAFNOSUPPORT); } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); netisr_queue(isr, m); /* mbuf is free'd on failure. */ return (0); } /* * Process an ioctl request. 
*/ static int loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; int error = 0, mask; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(ifp, LINK_STATE_UP); /* * Everything else is done at a higher level. */ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; case SIOCSIFFLAGS: if_link_state_change(ifp, (ifp->if_flags & IFF_UP) ? LINK_STATE_UP: LINK_STATE_DOWN); break; case SIOCSIFCAP: mask = ifp->if_capenable ^ ifr->ifr_reqcap; if ((mask & IFCAP_RXCSUM) != 0) ifp->if_capenable ^= IFCAP_RXCSUM; if ((mask & IFCAP_TXCSUM) != 0) ifp->if_capenable ^= IFCAP_TXCSUM; if ((mask & IFCAP_RXCSUM_IPV6) != 0) { #if 0 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; #else error = EOPNOTSUPP; break; #endif } if ((mask & IFCAP_TXCSUM_IPV6) != 0) { #if 0 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; #else error = EOPNOTSUPP; break; #endif } ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist = LO_CSUM_FEATURES; #if 0 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= LO_CSUM_FEATURES6; #endif break; default: error = EINVAL; } return (error); } diff --git a/sys/net/if_ovpn.c b/sys/net/if_ovpn.c index 82e6dd4f6eee..286125fb42d5 100644 --- a/sys/net/if_ovpn.c +++ b/sys/net/if_ovpn.c @@ -1,2470 +1,2476 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021-2022 Rubicon Communications, LLC (Netgate) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "if_ovpn.h" struct ovpn_kkey_dir { int refcount; uint8_t key[32]; uint8_t keylen; uint8_t nonce[8]; uint8_t noncelen; enum ovpn_key_cipher cipher; crypto_session_t cryptoid; struct mtx replay_mtx; /* * Last seen gapless sequence number. New rx seq numbers must be * strictly higher than this. */ uint32_t rx_seq; /* Seen packets, relative to rx_seq. bit(0) will always be 0. 
*/ uint64_t rx_window; }; struct ovpn_kkey { struct ovpn_kkey_dir *encrypt; struct ovpn_kkey_dir *decrypt; uint8_t keyid; uint32_t peerid; }; struct ovpn_keepalive { uint32_t interval; uint32_t timeout; }; struct ovpn_wire_header { uint32_t opcode; /* opcode, key id, peer id */ uint32_t seq; uint8_t auth_tag[16]; }; struct ovpn_notification { enum ovpn_notif_type type; uint32_t peerid; }; struct ovpn_softc; struct ovpn_kpeer { int refcount; uint32_t peerid; struct ovpn_softc *sc; struct sockaddr_storage local; struct sockaddr_storage remote; struct in_addr vpn4; struct in6_addr vpn6; struct ovpn_kkey keys[2]; uint32_t tx_seq; struct ovpn_keepalive keepalive; uint32_t *last_active; struct callout ping_send; struct callout ping_rcv; }; #define OVPN_MAX_PEERS 128 struct ovpn_counters { uint64_t lost_ctrl_pkts_in; uint64_t lost_ctrl_pkts_out; uint64_t lost_data_pkts_in; uint64_t lost_data_pkts_out; uint64_t nomem_data_pkts_in; uint64_t nomem_data_pkts_out; uint64_t received_ctrl_pkts; uint64_t received_data_pkts; uint64_t sent_ctrl_pkts; uint64_t sent_data_pkts; uint64_t transport_bytes_sent; uint64_t transport_bytes_received; uint64_t tunnel_bytes_sent; uint64_t tunnel_bytes_received; }; #define OVPN_COUNTER_SIZE (sizeof(struct ovpn_counters)/sizeof(uint64_t)) struct ovpn_softc { int refcount; struct rmlock lock; struct ifnet *ifp; struct socket *so; int peercount; struct ovpn_kpeer *peers[OVPN_MAX_PEERS]; /* XXX Hard limit for now? 
*/ /* Pending packets */ struct buf_ring *rxring; struct buf_ring *notifring; counter_u64_t counters[OVPN_COUNTER_SIZE]; struct epoch_context epoch_ctx; }; static struct ovpn_kpeer *ovpn_find_peer(struct ovpn_softc *, uint32_t); static bool ovpn_udp_input(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); static int ovpn_transmit_to_peer(struct ifnet *, struct mbuf *, struct ovpn_kpeer *, struct rm_priotracker *); static int ovpn_encap(struct ovpn_softc *, uint32_t, struct mbuf *); static int ovpn_get_af(struct mbuf *); static void ovpn_free_kkey_dir(struct ovpn_kkey_dir *); static bool ovpn_check_replay(struct ovpn_kkey_dir *, uint32_t); #define OVPN_MTU_MIN 576 #define OVPN_MTU_MAX (IP_MAXPACKET - sizeof(struct ip) - \ sizeof(struct udphdr) - sizeof(struct ovpn_wire_header)) #define OVPN_OP_DATA_V2 0x09 #define OVPN_OP_SHIFT 3 VNET_DEFINE_STATIC(struct if_clone *, ovpn_cloner); #define V_ovpn_cloner VNET(ovpn_cloner) #define OVPN_RLOCK_TRACKER struct rm_priotracker _ovpn_lock_tracker; \ struct rm_priotracker *_ovpn_lock_trackerp = &_ovpn_lock_tracker #define OVPN_RLOCK(sc) rm_rlock(&(sc)->lock, _ovpn_lock_trackerp) #define OVPN_RUNLOCK(sc) rm_runlock(&(sc)->lock, _ovpn_lock_trackerp) #define OVPN_WLOCK(sc) rm_wlock(&(sc)->lock) #define OVPN_WUNLOCK(sc) rm_wunlock(&(sc)->lock) #define OVPN_ASSERT(sc) rm_assert(&(sc)->lock, RA_LOCKED) #define OVPN_RASSERT(sc) rm_assert(&(sc)->lock, RA_RLOCKED) #define OVPN_WASSERT(sc) rm_assert(&(sc)->lock, RA_WLOCKED) #define OVPN_UNLOCK_ASSERT(sc) rm_assert(&(sc)->lock, RA_UNLOCKED) #define OVPN_COUNTER_ADD(sc, name, val) \ counter_u64_add(sc->counters[offsetof(struct ovpn_counters, name) / \ sizeof(uint64_t)], val) #define TO_IN(x) ((struct sockaddr_in *)(x)) #define TO_IN6(x) ((struct sockaddr_in6 *)(x)) SDT_PROVIDER_DEFINE(if_ovpn); SDT_PROBE_DEFINE1(if_ovpn, tx, transmit, start, "struct mbuf *"); SDT_PROBE_DEFINE2(if_ovpn, tx, route, ip4, "struct in_addr *", "struct ovpn_kpeer *"); 
SDT_PROBE_DEFINE2(if_ovpn, tx, route, ip6, "struct in6_addr *", "struct ovpn_kpeer *"); static const char ovpnname[] = "ovpn"; static const char ovpngroupname[] = "openvpn"; static MALLOC_DEFINE(M_OVPN, ovpnname, "OpenVPN DCO Interface"); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_OTHER, openvpn, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "OpenVPN DCO Interface"); VNET_DEFINE_STATIC(int, replay_protection) = 0; #define V_replay_protection VNET(replay_protection) SYSCTL_INT(_net_link_openvpn, OID_AUTO, replay_protection, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(replay_protection), 0, "Validate sequence numbers"); static struct ovpn_kpeer * ovpn_find_peer(struct ovpn_softc *sc, uint32_t peerid) { struct ovpn_kpeer *p = NULL; OVPN_ASSERT(sc); for (int i = 0; i < OVPN_MAX_PEERS; i++) { p = sc->peers[i]; if (p == NULL) continue; if (p->peerid == peerid) { MPASS(p->sc == sc); break; } } return (p); } static struct ovpn_kpeer * ovpn_find_only_peer(struct ovpn_softc *sc) { OVPN_ASSERT(sc); for (int i = 0; i < OVPN_MAX_PEERS; i++) { if (sc->peers[i] == NULL) continue; return (sc->peers[i]); } MPASS(false); return (NULL); } static uint16_t ovpn_get_port(struct sockaddr_storage *s) { switch (s->ss_family) { case AF_INET: { struct sockaddr_in *in = (struct sockaddr_in *)s; return (in->sin_port); } case AF_INET6: { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)s; return (in6->sin6_port); } default: panic("Unsupported address family %d", s->ss_family); } } static int ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa) { int af; if (! nvlist_exists_number(nvl, "af")) return (EINVAL); if (! nvlist_exists_binary(nvl, "address")) return (EINVAL); if (! 
nvlist_exists_number(nvl, "port")) return (EINVAL); af = nvlist_get_number(nvl, "af"); switch (af) { #ifdef INET case AF_INET: { struct sockaddr_in *in = (struct sockaddr_in *)sa; size_t len; const void *addr = nvlist_get_binary(nvl, "address", &len); in->sin_family = af; if (len != sizeof(in->sin_addr)) return (EINVAL); memcpy(&in->sin_addr, addr, sizeof(in->sin_addr)); in->sin_port = nvlist_get_number(nvl, "port"); break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)sa; size_t len; const void *addr = nvlist_get_binary(nvl, "address", &len); in6->sin6_family = af; if (len != sizeof(in6->sin6_addr)) return (EINVAL); memcpy(&in6->sin6_addr, addr, sizeof(in6->sin6_addr)); in6->sin6_port = nvlist_get_number(nvl, "port"); break; } #endif default: return (EINVAL); } return (0); } static bool ovpn_has_peers(struct ovpn_softc *sc) { OVPN_ASSERT(sc); return (sc->peercount > 0); } static void ovpn_rele_so(struct ovpn_softc *sc, struct ovpn_kpeer *peer) { bool has_peers; OVPN_WASSERT(sc); if (sc->so == NULL) return; has_peers = ovpn_has_peers(sc); /* Only remove the tunnel function if we're releasing the socket for * the last peer. */ if (! has_peers) (void)udp_set_kernel_tunneling(sc->so, NULL, NULL, NULL); sorele(sc->so); if (! has_peers) sc->so = NULL; } static void ovpn_notify_del_peer(struct ovpn_softc *sc, struct ovpn_kpeer *peer) { struct ovpn_notification *n; OVPN_WASSERT(sc); n = malloc(sizeof(*n), M_OVPN, M_NOWAIT); if (n == NULL) return; n->peerid = peer->peerid; n->type = OVPN_NOTIF_DEL_PEER; if (buf_ring_enqueue(sc->notifring, n) != 0) { free(n, M_OVPN); } else if (sc->so != NULL) { /* Wake up userspace */ sc->so->so_error = EAGAIN; sorwakeup(sc->so); sowwakeup(sc->so); } } static void ovpn_peer_release_ref(struct ovpn_kpeer *peer, bool locked) { struct ovpn_softc *sc; atomic_add_int(&peer->refcount, -1); if (atomic_load_int(&peer->refcount) > 0) return; sc = peer->sc; if (! 
locked) { OVPN_WLOCK(sc); /* Might have changed before we acquired the lock. */ if (atomic_load_int(&peer->refcount) > 0) { OVPN_WUNLOCK(sc); return; } } /* The peer should have been removed from the list already. */ MPASS(ovpn_find_peer(sc, peer->peerid) == NULL); ovpn_notify_del_peer(sc, peer); for (int i = 0; i < 2; i++) { ovpn_free_kkey_dir(peer->keys[i].encrypt); ovpn_free_kkey_dir(peer->keys[i].decrypt); } ovpn_rele_so(sc, peer); callout_stop(&peer->ping_send); callout_stop(&peer->ping_rcv); uma_zfree_pcpu(pcpu_zone_4, peer->last_active); free(peer, M_OVPN); if (! locked) OVPN_WUNLOCK(sc); } static int ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl) { #ifdef INET6 struct epoch_tracker et; #endif struct sockaddr_storage remote; struct ovpn_kpeer *peer = NULL; struct file *fp = NULL; struct sockaddr *name = NULL; struct ovpn_softc *sc = ifp->if_softc; struct thread *td = curthread; struct socket *so = NULL; int fd; uint32_t peerid; int ret = 0, i; if (nvl == NULL) return (EINVAL); if (! nvlist_exists_number(nvl, "peerid")) return (EINVAL); if (! nvlist_exists_number(nvl, "fd")) return (EINVAL); if (! nvlist_exists_nvlist(nvl, "remote")) return (EINVAL); peerid = nvlist_get_number(nvl, "peerid"); ret = ovpn_nvlist_to_sockaddr(nvlist_get_nvlist(nvl, "remote"), &remote); if (ret != 0) return (ret); fd = nvlist_get_number(nvl, "fd"); /* Look up the userspace process and use the fd to find the socket. 
*/ ret = getsock(td, fd, &cap_connect_rights, &fp); if (ret != 0) return (ret); so = fp->f_data; peer = malloc(sizeof(*peer), M_OVPN, M_WAITOK | M_ZERO); peer->peerid = peerid; peer->sc = sc; peer->tx_seq = 1; peer->refcount = 1; peer->last_active = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO); if (nvlist_exists_binary(nvl, "vpn_ipv4")) { size_t len; const void *addr = nvlist_get_binary(nvl, "vpn_ipv4", &len); if (len != sizeof(peer->vpn4)) { ret = EINVAL; goto error; } memcpy(&peer->vpn4, addr, len); } if (nvlist_exists_binary(nvl, "vpn_ipv6")) { size_t len; const void *addr = nvlist_get_binary(nvl, "vpn_ipv6", &len); if (len != sizeof(peer->vpn6)) { ret = EINVAL; goto error; } memcpy(&peer->vpn6, addr, len); } callout_init_rm(&peer->ping_send, &sc->lock, CALLOUT_SHAREDLOCK); callout_init_rm(&peer->ping_rcv, &sc->lock, 0); ret = so->so_proto->pr_sockaddr(so, &name); if (ret) goto error; if (ovpn_get_port((struct sockaddr_storage *)name) == 0) { ret = EINVAL; goto error; } if (name->sa_family != remote.ss_family) { ret = EINVAL; goto error; } memcpy(&peer->local, name, name->sa_len); memcpy(&peer->remote, &remote, sizeof(remote)); free(name, M_SONAME); name = NULL; if (peer->local.ss_family == AF_INET6 && IN6_IS_ADDR_V4MAPPED(&TO_IN6(&peer->remote)->sin6_addr)) { /* V4 mapped address, so treat this as v4, not v6. */ in6_sin6_2_sin_in_sock((struct sockaddr *)&peer->local); in6_sin6_2_sin_in_sock((struct sockaddr *)&peer->remote); } #ifdef INET6 if (peer->local.ss_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&TO_IN6(&peer->local)->sin6_addr)) { NET_EPOCH_ENTER(et); ret = in6_selectsrc_addr(curthread->td_proc->p_fibnum, &TO_IN6(&peer->remote)->sin6_addr, 0, NULL, &TO_IN6(&peer->local)->sin6_addr, NULL); NET_EPOCH_EXIT(et); if (ret != 0) { goto error; } } #endif OVPN_WLOCK(sc); /* Disallow peer id re-use. */ if (ovpn_find_peer(sc, peerid) != NULL) { ret = EEXIST; goto error_locked; } /* Make sure this is really a UDP socket. 
*/ if (so->so_type != SOCK_DGRAM || so->so_proto->pr_type != SOCK_DGRAM) { ret = EPROTOTYPE; goto error_locked; } /* Must be the same socket as for other peers on this interface. */ if (sc->so != NULL && so != sc->so) goto error_locked; if (sc->so == NULL) sc->so = so; /* Insert the peer into the list. */ for (i = 0; i < OVPN_MAX_PEERS; i++) { if (sc->peers[i] != NULL) continue; MPASS(sc->peers[i] == NULL); sc->peers[i] = peer; sc->peercount++; soref(sc->so); break; } if (i == OVPN_MAX_PEERS) { ret = ENOSPC; goto error_locked; } ret = udp_set_kernel_tunneling(sc->so, ovpn_udp_input, NULL, sc); if (ret == EBUSY) { /* Fine, another peer already set the input function. */ ret = 0; } if (ret != 0) { sc->peers[i] = NULL; sc->peercount--; goto error_locked; } OVPN_WUNLOCK(sc); goto done; error_locked: OVPN_WUNLOCK(sc); error: free(name, M_SONAME); uma_zfree_pcpu(pcpu_zone_4, peer->last_active); free(peer, M_OVPN); done: if (fp != NULL) fdrop(fp, td); return (ret); } static int _ovpn_del_peer(struct ovpn_softc *sc, uint32_t peerid) { struct ovpn_kpeer *peer; int i; OVPN_WASSERT(sc); for (i = 0; i < OVPN_MAX_PEERS; i++) { if (sc->peers[i] == NULL) continue; if (sc->peers[i]->peerid != peerid) continue; peer = sc->peers[i]; break; } if (i == OVPN_MAX_PEERS) return (ENOENT); sc->peers[i] = NULL; sc->peercount--; ovpn_peer_release_ref(peer, true); return (0); } static int ovpn_del_peer(struct ifnet *ifp, nvlist_t *nvl) { struct ovpn_softc *sc = ifp->if_softc; uint32_t peerid; int ret; OVPN_WASSERT(sc); if (nvl == NULL) return (EINVAL); if (! nvlist_exists_number(nvl, "peerid")) return (EINVAL); peerid = nvlist_get_number(nvl, "peerid"); ret = _ovpn_del_peer(sc, peerid); return (ret); } static int ovpn_create_kkey_dir(struct ovpn_kkey_dir **kdirp, const nvlist_t *nvl) { struct crypto_session_params csp; struct ovpn_kkey_dir *kdir; const char *ciphername; enum ovpn_key_cipher cipher; const void *key, *iv; size_t keylen = 0, ivlen = 0; int error; if (! 
nvlist_exists_string(nvl, "cipher")) return (EINVAL); ciphername = nvlist_get_string(nvl, "cipher"); if (strcmp(ciphername, "none") == 0) cipher = OVPN_CIPHER_ALG_NONE; else if (strcmp(ciphername, "AES-256-GCM") == 0) cipher = OVPN_CIPHER_ALG_AES_GCM; else if (strcmp(ciphername, "CHACHA20-POLY1305") == 0) cipher = OVPN_CIPHER_ALG_CHACHA20_POLY1305; else return (EINVAL); if (cipher != OVPN_CIPHER_ALG_NONE) { if (! nvlist_exists_binary(nvl, "key")) return (EINVAL); key = nvlist_get_binary(nvl, "key", &keylen); if (keylen > sizeof(kdir->key)) return (E2BIG); if (! nvlist_exists_binary(nvl, "iv")) return (EINVAL); iv = nvlist_get_binary(nvl, "iv", &ivlen); if (ivlen != 8) return (E2BIG); } kdir = malloc(sizeof(struct ovpn_kkey_dir), M_OVPN, M_WAITOK | M_ZERO); kdir->cipher = cipher; kdir->keylen = keylen; memcpy(kdir->key, key, keylen); kdir->noncelen = ivlen; memcpy(kdir->nonce, iv, ivlen); if (kdir->cipher != OVPN_CIPHER_ALG_NONE) { /* Crypto init */ bzero(&csp, sizeof(csp)); csp.csp_mode = CSP_MODE_AEAD; if (kdir->cipher == OVPN_CIPHER_ALG_CHACHA20_POLY1305) csp.csp_cipher_alg = CRYPTO_CHACHA20_POLY1305; else csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16; csp.csp_flags |= CSP_F_SEPARATE_AAD; csp.csp_cipher_klen = kdir->keylen; csp.csp_cipher_key = kdir->key; csp.csp_ivlen = 96 / 8; error = crypto_newsession(&kdir->cryptoid, &csp, CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); if (error) { free(kdir, M_OVPN); return (error); } } mtx_init(&kdir->replay_mtx, "if_ovpn rx replay", NULL, MTX_DEF); *kdirp = kdir; return (0); } static void ovpn_free_kkey_dir(struct ovpn_kkey_dir *kdir) { if (kdir == NULL) return; mtx_destroy(&kdir->replay_mtx); crypto_freesession(kdir->cryptoid); free(kdir, M_OVPN); } static int ovpn_set_key(struct ifnet *ifp, const nvlist_t *nvl) { struct ovpn_softc *sc = ifp->if_softc; struct ovpn_kkey_dir *enc, *dec; struct ovpn_kpeer *peer; int slot, keyid, peerid; int error; if (nvl == NULL) return (EINVAL); if (! 
nvlist_exists_number(nvl, "slot")) return (EINVAL); slot = nvlist_get_number(nvl, "slot"); if (! nvlist_exists_number(nvl, "keyid")) return (EINVAL); keyid = nvlist_get_number(nvl, "keyid"); if (! nvlist_exists_number(nvl, "peerid")) return (EINVAL); peerid = nvlist_get_number(nvl, "peerid"); if (slot != OVPN_KEY_SLOT_PRIMARY && slot != OVPN_KEY_SLOT_SECONDARY) return (EINVAL); if (! nvlist_exists_nvlist(nvl, "encrypt") || ! nvlist_exists_nvlist(nvl, "decrypt")) return (EINVAL); error = ovpn_create_kkey_dir(&enc, nvlist_get_nvlist(nvl, "encrypt")); if (error) return (error); error = ovpn_create_kkey_dir(&dec, nvlist_get_nvlist(nvl, "decrypt")); if (error) { ovpn_free_kkey_dir(enc); return (error); } OVPN_WLOCK(sc); peer = ovpn_find_peer(sc, peerid); if (peer == NULL) { ovpn_free_kkey_dir(dec); ovpn_free_kkey_dir(enc); OVPN_WUNLOCK(sc); return (ENOENT); } ovpn_free_kkey_dir(peer->keys[slot].encrypt); ovpn_free_kkey_dir(peer->keys[slot].decrypt); peer->keys[slot].encrypt = enc; peer->keys[slot].decrypt = dec; peer->keys[slot].keyid = keyid; peer->keys[slot].peerid = peerid; OVPN_WUNLOCK(sc); return (0); } static int ovpn_check_key(struct ovpn_softc *sc, struct ovpn_kpeer *peer, enum ovpn_key_slot slot) { OVPN_ASSERT(sc); if (peer->keys[slot].encrypt == NULL) return (ENOLINK); if (peer->keys[slot].decrypt == NULL) return (ENOLINK); return (0); } static int ovpn_start(struct ifnet *ifp) { struct ovpn_softc *sc = ifp->if_softc; OVPN_WLOCK(sc); ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; if_link_state_change(ifp, LINK_STATE_UP); OVPN_WUNLOCK(sc); return (0); } static int ovpn_swap_keys(struct ifnet *ifp, nvlist_t *nvl) { struct ovpn_softc *sc = ifp->if_softc; struct ovpn_kpeer *peer; struct ovpn_kkey tmpkey; int error; if (nvl == NULL) return (EINVAL); if (! 
nvlist_exists_number(nvl, "peerid")) return (EINVAL); OVPN_WLOCK(sc); peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid")); if (peer == NULL) { OVPN_WUNLOCK(sc); return (ENOENT); } /* Check that we have a second key to swap to. */ error = ovpn_check_key(sc, peer, OVPN_KEY_SLOT_SECONDARY); if (error) { OVPN_WUNLOCK(sc); return (error); } tmpkey = peer->keys[0]; peer->keys[0] = peer->keys[1]; peer->keys[1] = tmpkey; OVPN_WUNLOCK(sc); return (0); } static int ovpn_del_key(struct ifnet *ifp, const nvlist_t *nvl) { enum ovpn_key_slot slot; struct ovpn_kpeer *peer; struct ovpn_softc *sc = ifp->if_softc; if (nvl == NULL) return (EINVAL); if (! nvlist_exists_number(nvl, "peerid")) return (EINVAL); if (! nvlist_exists_number(nvl, "slot")) return (EINVAL); slot = nvlist_get_number(nvl, "slot"); if (slot != OVPN_KEY_SLOT_PRIMARY && slot != OVPN_KEY_SLOT_SECONDARY) return (EINVAL); OVPN_WLOCK(sc); peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid")); if (peer == NULL) { OVPN_WUNLOCK(sc); return (ENOENT); } ovpn_free_kkey_dir(peer->keys[slot].encrypt); ovpn_free_kkey_dir(peer->keys[slot].decrypt); peer->keys[slot].encrypt = NULL; peer->keys[slot].decrypt = NULL; peer->keys[slot].keyid = 0; peer->keys[slot].peerid = 0; OVPN_WUNLOCK(sc); return (0); } static int ovpn_send_pkt(struct ifnet *ifp, const nvlist_t *nvl) { struct epoch_tracker et; struct ovpn_softc *sc = ifp->if_softc; struct mbuf *m; const uint8_t *pkt; size_t pktlen; uint32_t peerid; int ret; if (nvl == NULL) return (EINVAL); if (! nvlist_exists_binary(nvl, "packet")) return (EINVAL); pkt = nvlist_get_binary(nvl, "packet", &pktlen); if (! nvlist_exists_number(nvl, "peerid")) return (EINVAL); peerid = nvlist_get_number(nvl, "peerid"); /* * Check that userspace isn't giving us a data packet. That might lead * to IV re-use, which would be bad. 
*/ if ((pkt[0] >> OVPN_OP_SHIFT) == OVPN_OP_DATA_V2) return (EINVAL); m = m_get2(pktlen, M_WAITOK, MT_DATA, M_PKTHDR); if (m == NULL) return (ENOMEM); m->m_len = m->m_pkthdr.len = pktlen; m_copyback(m, 0, m->m_len, pkt); /* Now prepend IP/UDP headers and transmit the mbuf. */ NET_EPOCH_ENTER(et); ret = ovpn_encap(sc, peerid, m); NET_EPOCH_EXIT(et); if (ret == 0) OVPN_COUNTER_ADD(sc, sent_ctrl_pkts, 1); else OVPN_COUNTER_ADD(sc, lost_ctrl_pkts_out, 1); return (ret); } static void ovpn_send_ping(void *arg) { static const uint8_t ping_str[] = { 0x2a, 0x18, 0x7b, 0xf3, 0x64, 0x1e, 0xb4, 0xcb, 0x07, 0xed, 0x2d, 0x0a, 0x98, 0x1f, 0xc7, 0x48 }; struct epoch_tracker et; struct ovpn_kpeer *peer = arg; struct ovpn_softc *sc = peer->sc; struct mbuf *m; OVPN_RASSERT(sc); /* Ensure we repeat! */ callout_reset(&peer->ping_send, peer->keepalive.interval * hz, ovpn_send_ping, peer); m = m_get2(sizeof(ping_str), M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) return; m_copyback(m, 0, sizeof(ping_str), ping_str); m->m_len = m->m_pkthdr.len = sizeof(ping_str); CURVNET_SET(sc->ifp->if_vnet); NET_EPOCH_ENTER(et); (void)ovpn_transmit_to_peer(sc->ifp, m, peer, NULL); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } static void ovpn_timeout(void *arg) { struct ovpn_kpeer *peer = arg; struct ovpn_softc *sc = peer->sc; uint32_t last, _last_active; int ret __diagused; int cpu; OVPN_WASSERT(sc); last = 0; CPU_FOREACH(cpu) { _last_active = *zpcpu_get_cpu(peer->last_active, cpu); if (_last_active > last) last = _last_active; } if (last + peer->keepalive.timeout > time_uptime) { callout_reset(&peer->ping_rcv, (peer->keepalive.timeout - (time_uptime - last)) * hz, ovpn_timeout, peer); return; } CURVNET_SET(sc->ifp->if_vnet); ret = _ovpn_del_peer(sc, peer->peerid); MPASS(ret == 0); CURVNET_RESTORE(); } static int ovpn_set_peer(struct ifnet *ifp, const nvlist_t *nvl) { struct ovpn_softc *sc = ifp->if_softc; struct ovpn_kpeer *peer; if (nvl == NULL) return (EINVAL); if (! 
nvlist_exists_number(nvl, "interval") || ! nvlist_exists_number(nvl, "timeout") || ! nvlist_exists_number(nvl, "peerid")) return (EINVAL); OVPN_WLOCK(sc); peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid")); if (peer == NULL) { OVPN_WUNLOCK(sc); return (ENOENT); } peer->keepalive.interval = nvlist_get_number(nvl, "interval"); peer->keepalive.timeout = nvlist_get_number(nvl, "timeout"); if (peer->keepalive.interval > 0) callout_reset(&peer->ping_send, peer->keepalive.interval * hz, ovpn_send_ping, peer); if (peer->keepalive.timeout > 0) callout_reset(&peer->ping_rcv, peer->keepalive.timeout * hz, ovpn_timeout, peer); OVPN_WUNLOCK(sc); return (0); } static int ovpn_ioctl_set(struct ifnet *ifp, struct ifdrv *ifd) { struct ovpn_softc *sc = ifp->if_softc; uint8_t *buf = NULL; nvlist_t *nvl = NULL; int ret; if (ifd->ifd_len != 0) { if (ifd->ifd_len > OVPN_MAX_REQUEST_SIZE) return (E2BIG); buf = malloc(ifd->ifd_len, M_OVPN, M_WAITOK); ret = copyin(ifd->ifd_data, buf, ifd->ifd_len); if (ret != 0) { free(buf, M_OVPN); return (ret); } nvl = nvlist_unpack(buf, ifd->ifd_len, 0); free(buf, M_OVPN); if (nvl == NULL) { return (EINVAL); } } switch (ifd->ifd_cmd) { case OVPN_NEW_PEER: ret = ovpn_new_peer(ifp, nvl); break; case OVPN_DEL_PEER: OVPN_WLOCK(sc); ret = ovpn_del_peer(ifp, nvl); OVPN_WUNLOCK(sc); break; case OVPN_NEW_KEY: ret = ovpn_set_key(ifp, nvl); break; case OVPN_START_VPN: ret = ovpn_start(ifp); break; case OVPN_SWAP_KEYS: ret = ovpn_swap_keys(ifp, nvl); break; case OVPN_DEL_KEY: ret = ovpn_del_key(ifp, nvl); break; case OVPN_SEND_PKT: ret = ovpn_send_pkt(ifp, nvl); break; case OVPN_SET_PEER: ret = ovpn_set_peer(ifp, nvl); break; default: ret = ENOTSUP; } nvlist_destroy(nvl); return (ret); } static int ovpn_add_counters(nvlist_t *parent, const char *name, counter_u64_t in, counter_u64_t out) { nvlist_t *nvl; nvl = nvlist_create(0); if (nvl == NULL) return (ENOMEM); nvlist_add_number(nvl, "in", counter_u64_fetch(in)); nvlist_add_number(nvl, "out", 
counter_u64_fetch(out)); nvlist_add_nvlist(parent, name, nvl); nvlist_destroy(nvl); return (0); } static int ovpn_get_stats(struct ovpn_softc *sc, nvlist_t **onvl) { nvlist_t *nvl; int ret; nvl = nvlist_create(0); if (nvl == NULL) return (ENOMEM); #define OVPN_COUNTER_OUT(name, in, out) \ do { \ ret = ovpn_add_counters(nvl, name, \ sc->counters[offsetof(struct ovpn_counters, in) / \ sizeof(uint64_t)], \ sc->counters[offsetof(struct ovpn_counters, out) / \ sizeof(uint64_t)]); \ if (ret != 0) \ goto error; \ } while(0) OVPN_COUNTER_OUT("lost_ctrl", lost_ctrl_pkts_in, lost_ctrl_pkts_out); OVPN_COUNTER_OUT("lost_data", lost_data_pkts_in, lost_data_pkts_out); OVPN_COUNTER_OUT("nomem_data", nomem_data_pkts_in, nomem_data_pkts_out); OVPN_COUNTER_OUT("data", received_data_pkts, sent_data_pkts); OVPN_COUNTER_OUT("ctrl", received_ctrl_pkts, sent_ctrl_pkts); OVPN_COUNTER_OUT("tunnel", tunnel_bytes_received, tunnel_bytes_received); OVPN_COUNTER_OUT("transport", transport_bytes_received, transport_bytes_received); #undef OVPN_COUNTER_OUT *onvl = nvl; return (0); error: nvlist_destroy(nvl); return (ret); } static int ovpn_poll_pkt(struct ovpn_softc *sc, nvlist_t **onvl) { nvlist_t *nvl; nvl = nvlist_create(0); if (nvl == NULL) return (ENOMEM); nvlist_add_number(nvl, "pending", buf_ring_count(sc->rxring) + buf_ring_count(sc->notifring)); *onvl = nvl; return (0); } static int opvn_get_pkt(struct ovpn_softc *sc, nvlist_t **onvl) { struct ovpn_notification *n; struct ovpn_wire_header *ohdr; struct mbuf *m; uint8_t *buf; nvlist_t *nvl; uint32_t peerid; u_int mlength; /* Check if we have notifications pending. */ n = buf_ring_dequeue_mc(sc->notifring); if (n != NULL) { nvl = nvlist_create(0); if (nvl == NULL) { free(n, M_OVPN); return (ENOMEM); } nvlist_add_number(nvl, "peerid", n->peerid); nvlist_add_number(nvl, "notification", n->type); free(n, M_OVPN); *onvl = nvl; return (0); } /* Queued packet. 
*/ m = buf_ring_dequeue_mc(sc->rxring); if (m == NULL) return (ENOENT); mlength = m_length(m, NULL); buf = malloc(mlength, M_NVLIST, M_WAITOK); m_copydata(m, 0, mlength, buf); ohdr = (struct ovpn_wire_header *)buf; peerid = ntohl(ohdr->opcode) & 0x00ffffff; nvl = nvlist_create(0); if (nvl == NULL) { OVPN_COUNTER_ADD(sc, lost_ctrl_pkts_in, 1); m_freem(m); free(buf, M_NVLIST); return (ENOMEM); } nvlist_move_binary(nvl, "packet", buf, mlength); buf = NULL; nvlist_add_number(nvl, "peerid", peerid); *onvl = nvl; m_freem(m); return (0); } static int ovpn_ioctl_get(struct ifnet *ifp, struct ifdrv *ifd) { struct ovpn_softc *sc = ifp->if_softc; nvlist_t *nvl = NULL; int error; switch (ifd->ifd_cmd) { case OVPN_GET_STATS: error = ovpn_get_stats(sc, &nvl); break; case OVPN_POLL_PKT: error = ovpn_poll_pkt(sc, &nvl); break; case OVPN_GET_PKT: error = opvn_get_pkt(sc, &nvl); break; default: error = ENOTSUP; break; } if (error == 0) { void *packed = NULL; size_t len; MPASS(nvl != NULL); packed = nvlist_pack(nvl, &len); if (! 
packed) { nvlist_destroy(nvl); return (ENOMEM); } if (len > ifd->ifd_len) { free(packed, M_NVLIST); nvlist_destroy(nvl); return (ENOSPC); } error = copyout(packed, ifd->ifd_data, len); ifd->ifd_len = len; free(packed, M_NVLIST); nvlist_destroy(nvl); } return (error); } static int ovpn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifdrv *ifd; int error; switch (cmd) { case SIOCSDRVSPEC: case SIOCGDRVSPEC: error = priv_check(curthread, PRIV_NET_OVPN); if (error) return (error); break; } switch (cmd) { case SIOCSDRVSPEC: ifd = (struct ifdrv *)data; error = ovpn_ioctl_set(ifp, ifd); break; case SIOCGDRVSPEC: ifd = (struct ifdrv *)data; error = ovpn_ioctl_get(ifp, ifd); break; case SIOCSIFMTU: { struct ifreq *ifr = (struct ifreq *)data; if (ifr->ifr_mtu < OVPN_MTU_MIN || ifr->ifr_mtu > OVPN_MTU_MAX) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); } case SIOCSIFADDR: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFMTU: case SIOCSIFFLAGS: return (0); default: error = EINVAL; } return (error); } static int ovpn_encrypt_tx_cb(struct cryptop *crp) { struct ovpn_kpeer *peer = crp->crp_opaque; struct ovpn_softc *sc = peer->sc; struct mbuf *m = crp->crp_buf.cb_mbuf; int ret; if (crp->crp_etype != 0) { crypto_freereq(crp); ovpn_peer_release_ref(peer, false); OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1); m_freem(m); return (0); } CURVNET_SET(sc->ifp->if_vnet); MPASS(crp->crp_buf.cb_type == CRYPTO_BUF_MBUF); ret = ovpn_encap(sc, peer->peerid, m); if (ret == 0) { OVPN_COUNTER_ADD(sc, sent_data_pkts, 1); OVPN_COUNTER_ADD(sc, tunnel_bytes_sent, m->m_pkthdr.len - sizeof(struct ovpn_wire_header)); } CURVNET_RESTORE(); crypto_freereq(crp); ovpn_peer_release_ref(peer, false); return (0); } static void ovpn_finish_rx(struct ovpn_softc *sc, struct mbuf *m, struct ovpn_kpeer *peer, struct ovpn_kkey *key, uint32_t seq, struct rm_priotracker *_ovpn_lock_trackerp) { uint32_t af; int ret __diagused; OVPN_RASSERT(sc); /* Replay protection. */ if (V_replay_protection && ! 
ovpn_check_replay(key->decrypt, seq)) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1); m_freem(m); return; } critical_enter(); *zpcpu_get(peer->last_active) = time_uptime; critical_exit(); OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, received_data_pkts, 1); OVPN_COUNTER_ADD(sc, tunnel_bytes_received, m->m_pkthdr.len); /* Receive the packet on our interface. */ m->m_pkthdr.rcvif = sc->ifp; /* Clear checksum flags in case the real hardware set them. */ m->m_pkthdr.csum_flags = 0; /* Ensure we can read the first byte. */ m = m_pullup(m, 1); if (m == NULL) { OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1); return; } /* * Check for address family, and disregard any control packets (e.g. * keepalive). */ af = ovpn_get_af(m); if (af != 0) { BPF_MTAP2(sc->ifp, &af, sizeof(af), m); ret = netisr_dispatch(af == AF_INET ? NETISR_IP : NETISR_IPV6, m); MPASS(ret == 0); } else { OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1); m_freem(m); } } static struct ovpn_kkey * ovpn_find_key(struct ovpn_softc *sc, struct ovpn_kpeer *peer, const struct ovpn_wire_header *ohdr) { struct ovpn_kkey *key = NULL; uint8_t keyid; OVPN_RASSERT(sc); keyid = (ntohl(ohdr->opcode) >> 24) & 0x07; if (peer->keys[0].keyid == keyid) key = &peer->keys[0]; else if (peer->keys[1].keyid == keyid) key = &peer->keys[1]; return (key); } static int ovpn_decrypt_rx_cb(struct cryptop *crp) { struct ovpn_softc *sc = crp->crp_opaque; struct mbuf *m = crp->crp_buf.cb_mbuf; struct ovpn_kkey *key; struct ovpn_kpeer *peer; struct ovpn_wire_header *ohdr; uint32_t peerid; OVPN_RLOCK_TRACKER; OVPN_RLOCK(sc); MPASS(crp->crp_buf.cb_type == CRYPTO_BUF_MBUF); if (crp->crp_etype != 0) { crypto_freereq(crp); atomic_add_int(&sc->refcount, -1); OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1); OVPN_RUNLOCK(sc); m_freem(m); return (0); } CURVNET_SET(sc->ifp->if_vnet); ohdr = mtodo(m, sizeof(struct udphdr)); peerid = ntohl(ohdr->opcode) & 0x00ffffff; peer = ovpn_find_peer(sc, peerid); if (peer == NULL) { /* No such peer. Drop packet. 
*/ crypto_freereq(crp); atomic_add_int(&sc->refcount, -1); OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1); m_freem(m); CURVNET_RESTORE(); return (0); } key = ovpn_find_key(sc, peer, ohdr); if (key == NULL) { crypto_freereq(crp); atomic_add_int(&sc->refcount, -1); /* * Has this key been removed between us starting the decrypt * and finishing it? */ OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1); m_freem(m); CURVNET_RESTORE(); return (0); } /* Now remove the outer headers */ m_adj_decap(m, sizeof(struct udphdr) + sizeof(struct ovpn_wire_header)); ovpn_finish_rx(sc, m, peer, key, ntohl(ohdr->seq), _ovpn_lock_trackerp); OVPN_UNLOCK_ASSERT(sc); CURVNET_RESTORE(); crypto_freereq(crp); atomic_add_int(&sc->refcount, -1); return (0); } static int ovpn_get_af(struct mbuf *m) { struct ip *ip; struct ip6_hdr *ip6; /* * We should pullup, but we're only interested in the first byte, so * that'll always be contiguous. */ ip = mtod(m, struct ip *); if (ip->ip_v == IPVERSION) return (AF_INET); ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_vfc == IPV6_VERSION) return (AF_INET6); return (0); } #ifdef INET static struct ovpn_kpeer * ovpn_find_peer_by_ip(struct ovpn_softc *sc, const struct in_addr addr) { struct ovpn_kpeer *peer = NULL; OVPN_ASSERT(sc); for (int i = 0; i < OVPN_MAX_PEERS; i++) { if (sc->peers[i] == NULL) continue; if (addr.s_addr == sc->peers[i]->vpn4.s_addr) { peer = sc->peers[i]; break; } } return (peer); } #endif #ifdef INET6 static struct ovpn_kpeer * ovpn_find_peer_by_ip6(struct ovpn_softc *sc, const struct in6_addr *addr) { struct ovpn_kpeer *peer = NULL; OVPN_ASSERT(sc); for (int i = 0; i < OVPN_MAX_PEERS; i++) { if (sc->peers[i] == NULL) continue; if (memcmp(addr, &sc->peers[i]->vpn6, sizeof(*addr)) == 0) { peer = sc->peers[i]; break; } } return (peer); } #endif static struct ovpn_kpeer * ovpn_route_peer(struct ovpn_softc *sc, struct mbuf **m0, const struct sockaddr *dst) { struct ovpn_kpeer *peer = NULL; int af; NET_EPOCH_ASSERT(); 
OVPN_ASSERT(sc); /* Shortcut if we're a client (or are a server and have only one client). */ if (sc->peercount == 1) return (ovpn_find_only_peer(sc)); if (dst != NULL) af = dst->sa_family; else af = ovpn_get_af(*m0); switch (af) { #ifdef INET case AF_INET: { const struct sockaddr_in *sa = (const struct sockaddr_in *)dst; struct nhop_object *nh; const struct in_addr *ip_dst; if (sa != NULL) { ip_dst = &sa->sin_addr; } else { struct ip *ip; *m0 = m_pullup(*m0, sizeof(struct ip)); if (*m0 == NULL) return (NULL); ip = mtod(*m0, struct ip *); ip_dst = &ip->ip_dst; } peer = ovpn_find_peer_by_ip(sc, *ip_dst); SDT_PROBE2(if_ovpn, tx, route, ip4, ip_dst, peer); if (peer == NULL) { nh = fib4_lookup(M_GETFIB(*m0), *ip_dst, 0, NHR_NONE, 0); if (nh && (nh->nh_flags & NHF_GATEWAY)) { peer = ovpn_find_peer_by_ip(sc, nh->gw4_sa.sin_addr); SDT_PROBE2(if_ovpn, tx, route, ip4, &nh->gw4_sa.sin_addr, peer); } } break; } #endif #ifdef INET6 case AF_INET6: { const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)dst; struct nhop_object *nh; const struct in6_addr *ip6_dst; if (sa6 != NULL) { ip6_dst = &sa6->sin6_addr; } else { struct ip6_hdr *ip6; *m0 = m_pullup(*m0, sizeof(struct ip6_hdr)); if (*m0 == NULL) return (NULL); ip6 = mtod(*m0, struct ip6_hdr *); ip6_dst = &ip6->ip6_dst; } peer = ovpn_find_peer_by_ip6(sc, ip6_dst); SDT_PROBE2(if_ovpn, tx, route, ip6, ip6_dst, peer); if (peer == NULL) { nh = fib6_lookup(M_GETFIB(*m0), ip6_dst, 0, NHR_NONE, 0); if (nh && (nh->nh_flags & NHF_GATEWAY)) { peer = ovpn_find_peer_by_ip6(sc, &nh->gw6_sa.sin6_addr); SDT_PROBE2(if_ovpn, tx, route, ip6, &nh->gw6_sa.sin6_addr, peer); } } break; } #endif } return (peer); } static int ovpn_transmit(struct ifnet *ifp, struct mbuf *m) { return (ifp->if_output(ifp, m, NULL, NULL)); } static int ovpn_transmit_to_peer(struct ifnet *ifp, struct mbuf *m, struct ovpn_kpeer *peer, struct rm_priotracker *_ovpn_lock_trackerp) { struct ovpn_wire_header *ohdr; struct ovpn_kkey *key; struct ovpn_softc *sc; struct 
cryptop *crp; uint32_t af, seq; size_t len, ovpn_hdr_len; int tunnel_len; int ret; sc = ifp->if_softc; OVPN_RASSERT(sc); tunnel_len = m->m_pkthdr.len; key = &peer->keys[OVPN_KEY_SLOT_PRIMARY]; if (key->encrypt == NULL) { if (_ovpn_lock_trackerp != NULL) OVPN_RUNLOCK(sc); m_freem(m); return (ENOLINK); } af = ovpn_get_af(m); /* Don't capture control packets. */ if (af != 0) BPF_MTAP2(ifp, &af, sizeof(af), m); len = m->m_pkthdr.len; MPASS(len <= ifp->if_mtu); ovpn_hdr_len = sizeof(struct ovpn_wire_header); if (key->encrypt->cipher == OVPN_CIPHER_ALG_NONE) ovpn_hdr_len -= 16; /* No auth tag. */ M_PREPEND(m, ovpn_hdr_len, M_NOWAIT); if (m == NULL) { if (_ovpn_lock_trackerp != NULL) OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1); return (ENOBUFS); } ohdr = mtod(m, struct ovpn_wire_header *); ohdr->opcode = (OVPN_OP_DATA_V2 << OVPN_OP_SHIFT) | key->keyid; ohdr->opcode <<= 24; ohdr->opcode |= key->peerid; ohdr->opcode = htonl(ohdr->opcode); seq = atomic_fetchadd_32(&peer->tx_seq, 1); seq = htonl(seq); ohdr->seq = seq; if (key->encrypt->cipher == OVPN_CIPHER_ALG_NONE) { ret = ovpn_encap(sc, peer->peerid, m); if (_ovpn_lock_trackerp != NULL) OVPN_RUNLOCK(sc); if (ret == 0) { OVPN_COUNTER_ADD(sc, sent_data_pkts, 1); OVPN_COUNTER_ADD(sc, tunnel_bytes_sent, tunnel_len); } return (ret); } crp = crypto_getreq(key->encrypt->cryptoid, M_NOWAIT); if (crp == NULL) { if (_ovpn_lock_trackerp != NULL) OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1); m_freem(m); return (ENOBUFS); } /* Encryption covers only the payload, not the header. */ crp->crp_payload_start = sizeof(*ohdr); crp->crp_payload_length = len; crp->crp_op = CRYPTO_OP_ENCRYPT; /* * AAD data covers the ovpn_wire_header minus the auth * tag. 
*/ crp->crp_aad_length = sizeof(*ohdr) - sizeof(ohdr->auth_tag); crp->crp_aad = ohdr; crp->crp_aad_start = 0; crp->crp_op |= CRYPTO_OP_COMPUTE_DIGEST; crp->crp_digest_start = offsetof(struct ovpn_wire_header, auth_tag); crp->crp_flags |= CRYPTO_F_IV_SEPARATE; memcpy(crp->crp_iv, &seq, sizeof(seq)); memcpy(crp->crp_iv + sizeof(seq), key->encrypt->nonce, key->encrypt->noncelen); crypto_use_mbuf(crp, m); crp->crp_flags |= CRYPTO_F_CBIFSYNC; crp->crp_callback = ovpn_encrypt_tx_cb; crp->crp_opaque = peer; atomic_add_int(&peer->refcount, 1); if (_ovpn_lock_trackerp != NULL) OVPN_RUNLOCK(sc); ret = crypto_dispatch(crp); if (ret) { OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1); } return (ret); } /* * Note: Expects to hold the read lock on entry, and will release it itself. */ static int ovpn_encap(struct ovpn_softc *sc, uint32_t peerid, struct mbuf *m) { struct udphdr *udp; struct ovpn_kpeer *peer; int len; OVPN_RLOCK_TRACKER; OVPN_RLOCK(sc); NET_EPOCH_ASSERT(); peer = ovpn_find_peer(sc, peerid); if (peer == NULL || sc->ifp->if_link_state != LINK_STATE_UP) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1); m_freem(m); return (ENETDOWN); } len = m->m_pkthdr.len; M_PREPEND(m, sizeof(struct udphdr), M_NOWAIT); if (m == NULL) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1); m_freem(m); return (ENOBUFS); } udp = mtod(m, struct udphdr *); MPASS(peer->local.ss_family == peer->remote.ss_family); udp->uh_sport = ovpn_get_port(&peer->local); udp->uh_dport = ovpn_get_port(&peer->remote); udp->uh_ulen = htons(sizeof(struct udphdr) + len); switch (peer->remote.ss_family) { #ifdef INET case AF_INET: { struct sockaddr_in *in_local = TO_IN(&peer->local); struct sockaddr_in *in_remote = TO_IN(&peer->remote); struct ip *ip; /* * This requires knowing the source IP, which we don't. Happily * we're allowed to keep this at 0, and the checksum won't do * anything the crypto won't already do. */ udp->uh_sum = 0; /* Set the checksum flags so we recalculate checksums. 
*/ m->m_pkthdr.csum_flags |= CSUM_IP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1); return (ENOBUFS); } ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct udphdr) + len); ip->ip_off = 0; ip->ip_ttl = V_ip_defttl; ip->ip_p = IPPROTO_UDP; ip->ip_sum = 0; if (in_local->sin_port != 0) ip->ip_src = in_local->sin_addr; else ip->ip_src.s_addr = INADDR_ANY; ip->ip_dst = in_remote->sin_addr; OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, transport_bytes_sent, m->m_pkthdr.len); return (ip_output(m, NULL, NULL, 0, NULL, NULL)); } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *in6_local = TO_IN6(&peer->local); struct sockaddr_in6 *in6_remote = TO_IN6(&peer->remote); struct ip6_hdr *ip6; M_PREPEND(m, sizeof(struct ip6_hdr), M_NOWAIT); if (m == NULL) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1); return (ENOBUFS); } m = m_pullup(m, sizeof(*ip6) + sizeof(*udp)); if (m == NULL) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1); return (ENOBUFS); } ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK; ip6->ip6_plen = htons(sizeof(*ip6) + sizeof(struct udphdr) + len); ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_hlim = V_ip6_defhlim; memcpy(&ip6->ip6_src, &in6_local->sin6_addr, sizeof(ip6->ip6_src)); memcpy(&ip6->ip6_dst, &in6_remote->sin6_addr, sizeof(ip6->ip6_dst)); udp = mtodo(m, sizeof(*ip6)); udp->uh_sum = in6_cksum_pseudo(ip6, m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0); m->m_pkthdr.csum_flags |= CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, transport_bytes_sent, m->m_pkthdr.len); return (ip6_output(m, NULL, NULL, IPV6_UNSPECSRC, NULL, NULL, NULL)); } #endif default: panic("Unsupported address family %d", peer->remote.ss_family); } } static int 
ovpn_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct ovpn_softc *sc; struct ovpn_kpeer *peer; OVPN_RLOCK_TRACKER; sc = ifp->if_softc; OVPN_RLOCK(sc); SDT_PROBE1(if_ovpn, tx, transmit, start, m); if (__predict_false(ifp->if_link_state != LINK_STATE_UP)) { OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1); OVPN_RUNLOCK(sc); m_freem(m); return (ENETDOWN); } /** * Only obey 'dst' (i.e. the gateway) if no route is supplied. * That's our indication that we're being called through pf's route-to, * and we should route according to 'dst' instead. We can't do so * consistently, because the usual openvpn configuration sets the first * non-server IP in the subnet as the gateway. If we always use that * one we'd end up routing all traffic to the first client. * tl;dr: 'ro == NULL' tells us pf is doing a route-to, and then but * only then, we should treat 'dst' as the destination. */ peer = ovpn_route_peer(sc, &m, ro == NULL ? dst : NULL); if (peer == NULL) { /* No destination. */ OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1); OVPN_RUNLOCK(sc); m_freem(m); return (ENETDOWN); } return (ovpn_transmit_to_peer(ifp, m, peer, _ovpn_lock_trackerp)); } static void ovpn_rcv_ctrl(struct ovpn_softc *sc, struct mbuf *m, int off) { /* Lop off the IP and UDP headers */ m_adj_decap(m, off); /* Keep in the local ring until userspace fetches it. */ if (buf_ring_enqueue(sc->rxring, m) != 0) { OVPN_COUNTER_ADD(sc, lost_ctrl_pkts_in, 1); m_freem(m); return; } OVPN_COUNTER_ADD(sc, received_ctrl_pkts, 1); } static bool ovpn_check_replay(struct ovpn_kkey_dir *key, uint32_t seq) { uint32_t d; mtx_lock(&key->replay_mtx); /* Sequence number must be strictly greater than rx_seq */ if (seq <= key->rx_seq) { mtx_unlock(&key->replay_mtx); return (false); } /* Large jump. The packet authenticated okay, so just accept that. 
*/ if (seq > (key->rx_seq + (sizeof(key->rx_window) * 8))) { key->rx_seq = seq; key->rx_window = 0; mtx_unlock(&key->replay_mtx); return (true); } /* Happy case. */ if ((seq == key->rx_seq + 1) && key->rx_window == 0) { key->rx_seq++; mtx_unlock(&key->replay_mtx); return (true); } d = seq - key->rx_seq - 1; if (key->rx_window & ((uint64_t)1 << d)) { /* Dupe! */ mtx_unlock(&key->replay_mtx); return (false); } key->rx_window |= (uint64_t)1 << d; while (key->rx_window & 1) { key->rx_seq++; key->rx_window >>= 1; } mtx_unlock(&key->replay_mtx); return (true); } static struct ovpn_kpeer * ovpn_peer_from_mbuf(struct ovpn_softc *sc, struct mbuf *m, int off) { struct ovpn_wire_header ohdr; uint32_t peerid; const size_t hdrlen = sizeof(ohdr) - sizeof(ohdr.auth_tag); OVPN_RASSERT(sc); if (m_length(m, NULL) < (off + sizeof(struct udphdr) + hdrlen)) return (NULL); m_copydata(m, off + sizeof(struct udphdr), hdrlen, (caddr_t)&ohdr); peerid = ntohl(ohdr.opcode) & 0x00ffffff; return (ovpn_find_peer(sc, peerid)); } static bool ovpn_udp_input(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct ovpn_softc *sc = ctx; struct ovpn_wire_header *ohdr; struct udphdr *uhdr; struct ovpn_kkey *key; struct cryptop *crp; struct ovpn_kpeer *peer; size_t ohdrlen; int ret; uint8_t op; OVPN_RLOCK_TRACKER; M_ASSERTPKTHDR(m); OVPN_COUNTER_ADD(sc, transport_bytes_received, m->m_pkthdr.len - off); ohdrlen = sizeof(*ohdr) - sizeof(ohdr->auth_tag); OVPN_RLOCK(sc); peer = ovpn_peer_from_mbuf(sc, m, off); if (peer == NULL) { OVPN_RUNLOCK(sc); return (false); } m = m_pullup(m, off + sizeof(*uhdr) + ohdrlen); if (m == NULL) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1); return (true); } uhdr = mtodo(m, off); ohdr = mtodo(m, off + sizeof(*uhdr)); op = ntohl(ohdr->opcode) >> 24 >> OVPN_OP_SHIFT; /* * Simplify things by getting rid of the preceding headers, we don't * care about them. 
*/ m_adj_decap(m, off); uhdr = mtodo(m, 0); ohdr = mtodo(m, sizeof(*uhdr)); if (op != OVPN_OP_DATA_V2) { OVPN_RUNLOCK(sc); ovpn_rcv_ctrl(sc, m, sizeof(struct udphdr)); INP_WLOCK(inp); udp_notify(inp, EAGAIN); INP_WUNLOCK(inp); return (true); } key = ovpn_find_key(sc, peer, ohdr); if (key == NULL || key->decrypt == NULL) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1); m_freem(m); return (true); } if (key->decrypt->cipher == OVPN_CIPHER_ALG_NONE) { /* Now remove the outer headers */ m_adj_decap(m, sizeof(struct udphdr) + ohdrlen); ohdr = mtodo(m, sizeof(*uhdr)); ovpn_finish_rx(sc, m, peer, key, ntohl(ohdr->seq), _ovpn_lock_trackerp); OVPN_UNLOCK_ASSERT(sc); return (true); } ohdrlen += sizeof(ohdr->auth_tag); m = m_pullup(m, sizeof(*uhdr) + ohdrlen); if (m == NULL) { OVPN_RUNLOCK(sc); OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1); return (true); } uhdr = mtodo(m, 0); ohdr = mtodo(m, sizeof(*uhdr)); /* Decrypt */ crp = crypto_getreq(key->decrypt->cryptoid, M_NOWAIT); if (crp == NULL) { OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1); OVPN_RUNLOCK(sc); m_freem(m); return (true); } crp->crp_payload_start = sizeof(struct udphdr) + sizeof(*ohdr); crp->crp_payload_length = ntohs(uhdr->uh_ulen) - sizeof(*uhdr) - sizeof(*ohdr); crp->crp_op = CRYPTO_OP_DECRYPT; /* AAD validation. 
*/ crp->crp_aad_length = sizeof(*ohdr) - sizeof(ohdr->auth_tag); crp->crp_aad = ohdr; crp->crp_aad_start = 0; crp->crp_op |= CRYPTO_OP_VERIFY_DIGEST; crp->crp_digest_start = sizeof(struct udphdr) + offsetof(struct ovpn_wire_header, auth_tag); crp->crp_flags |= CRYPTO_F_IV_SEPARATE; memcpy(crp->crp_iv, &ohdr->seq, sizeof(ohdr->seq)); memcpy(crp->crp_iv + sizeof(ohdr->seq), key->decrypt->nonce, key->decrypt->noncelen); crypto_use_mbuf(crp, m); crp->crp_flags |= CRYPTO_F_CBIFSYNC; crp->crp_callback = ovpn_decrypt_rx_cb; crp->crp_opaque = sc; atomic_add_int(&sc->refcount, 1); OVPN_RUNLOCK(sc); ret = crypto_dispatch(crp); if (ret != 0) { OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1); } return (true); } static void ovpn_qflush(struct ifnet *ifp __unused) { } static void ovpn_flush_rxring(struct ovpn_softc *sc) { struct mbuf *m; struct ovpn_notification *n; OVPN_WASSERT(sc); while (! buf_ring_empty(sc->rxring)) { m = buf_ring_dequeue_sc(sc->rxring); m_freem(m); } while (! buf_ring_empty(sc->notifring)) { n = buf_ring_dequeue_sc(sc->notifring); free(n, M_OVPN); } } #ifdef VIMAGE static void ovpn_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, char *unused __unused) { struct ovpn_softc *sc = ifp->if_softc; int i; int ret __diagused; i = 0; OVPN_WLOCK(sc); /* Flush keys & configuration. 
*/ do { if (sc->peers[i] != NULL) { ret = _ovpn_del_peer(sc, sc->peers[i]->peerid); MPASS(ret == 0); } i++; } while (i < OVPN_MAX_PEERS); ovpn_flush_rxring(sc); OVPN_WUNLOCK(sc); } #endif static int ovpn_clone_match(struct if_clone *ifc, const char *name) { /* * Allow all names that start with 'ovpn', specifically because pfSense * uses ovpnc1 / ovpns2 */ return (strncmp(ovpnname, name, strlen(ovpnname)) == 0); } static int -ovpn_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +ovpn_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { struct ovpn_softc *sc; struct ifnet *ifp; char *dp; int error, unit, wildcard; /* Try to see if a special unit was requested. */ error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); error = ifc_alloc_unit(ifc, &unit); if (error != 0) return (error); /* * If no unit had been given, we need to adjust the ifName. */ for (dp = name; *dp != '\0'; dp++); if (wildcard) { error = snprintf(dp, len - (dp - name), "%d", unit); if (error > len - (dp - name)) { /* ifName too long. */ ifc_free_unit(ifc, unit); return (ENOSPC); } dp += error; } /* Make sure it doesn't already exist. 
*/ if (ifunit(name) != NULL) return (EEXIST); sc = malloc(sizeof(struct ovpn_softc), M_OVPN, M_WAITOK | M_ZERO); sc->ifp = if_alloc(IFT_ENC); rm_init_flags(&sc->lock, "if_ovpn_lock", RM_RECURSE); sc->refcount = 0; sc->rxring = buf_ring_alloc(32, M_OVPN, M_WAITOK, NULL); sc->notifring = buf_ring_alloc(32, M_OVPN, M_WAITOK, NULL); COUNTER_ARRAY_ALLOC(sc->counters, OVPN_COUNTER_SIZE, M_WAITOK); ifp = sc->ifp; ifp->if_softc = sc; strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = ovpngroupname; ifp->if_dunit = unit; ifp->if_addrlen = 0; ifp->if_mtu = 1428; ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; ifp->if_ioctl = ovpn_ioctl; ifp->if_transmit = ovpn_transmit; ifp->if_output = ovpn_output; ifp->if_qflush = ovpn_qflush; #ifdef VIMAGE ifp->if_reassign = ovpn_reassign; #endif ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(uint32_t)); + *ifpp = ifp; return (0); } static void ovpn_clone_destroy_cb(struct epoch_context *ctx) { struct ovpn_softc *sc; sc = __containerof(ctx, struct ovpn_softc, epoch_ctx); MPASS(sc->peercount == 0); for (int i = 0; i < OVPN_MAX_PEERS; i++) { MPASS(sc->peers[i] == NULL); } COUNTER_ARRAY_FREE(sc->counters, OVPN_COUNTER_SIZE); if_free(sc->ifp); free(sc, M_OVPN); } static int -ovpn_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +ovpn_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct ovpn_softc *sc; int unit; int i; int ret __diagused; sc = ifp->if_softc; unit = ifp->if_dunit; OVPN_WLOCK(sc); if (atomic_load_int(&sc->refcount) > 0) { OVPN_WUNLOCK(sc); return (EBUSY); } i = 0; do { if (sc->peers[i] != NULL) { ret = _ovpn_del_peer(sc, sc->peers[i]->peerid); MPASS(ret == 0); } i++; } while (i < OVPN_MAX_PEERS); ovpn_flush_rxring(sc); buf_ring_free(sc->rxring, M_OVPN); buf_ring_free(sc->notifring, M_OVPN); OVPN_WUNLOCK(sc); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; NET_EPOCH_CALL(ovpn_clone_destroy_cb, 
&sc->epoch_ctx); if (unit != IF_DUNIT_NONE) ifc_free_unit(ifc, unit); NET_EPOCH_DRAIN_CALLBACKS(); return (0); } static void vnet_ovpn_init(const void *unused __unused) { - V_ovpn_cloner = if_clone_advanced(ovpngroupname, 0, ovpn_clone_match, - ovpn_clone_create, ovpn_clone_destroy); + struct if_clone_addreq req = { + .match_f = ovpn_clone_match, + .create_f = ovpn_clone_create, + .destroy_f = ovpn_clone_destroy, + }; + V_ovpn_cloner = ifc_attach_cloner(ovpngroupname, &req); } VNET_SYSINIT(vnet_ovpn_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_ovpn_init, NULL); static void vnet_ovpn_uninit(const void *unused __unused) { if_clone_detach(V_ovpn_cloner); } VNET_SYSUNINIT(vnet_ovpn_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_ovpn_uninit, NULL); static int ovpnmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: /* Done in vnet_ovpn_init() */ break; case MOD_UNLOAD: /* Done in vnet_ovpn_uninit() */ break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t ovpn_mod = { "if_ovpn", ovpnmodevent, 0 }; DECLARE_MODULE(if_ovpn, ovpn_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_ovpn, 1); diff --git a/sys/net/if_stf.c b/sys/net/if_stf.c index 51ea0a61ae0d..9a469a82c34c 100644 --- a/sys/net/if_stf.c +++ b/sys/net/if_stf.c @@ -1,1058 +1,1062 @@ /* $FreeBSD$ */ /* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 2000 WIDE Project. * Copyright (c) 2010 Hiroki Sato * Copyright (c) 2013 Ermal Luci * Copyright (c) 2017-2021 Rubicon Communications, LLC (Netgate) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * 6to4 interface, based on RFC3056. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 * address. Therefore, we do not have IFF_MULTICAST on the interface. * * Due to the lack of address mapping for link-local addresses, we cannot * throw packets toward link-local addresses (fe80::x). Also, we cannot throw * packets to link-local multicast addresses (ff02::x). * * Here are interesting symptoms due to the lack of link-local address: * * Unicast routing exchange: * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, * and link-local addresses as nexthop. * - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address * assigned to the link, and makes use of them. 
Also, HELLO packets use * link-local multicast addresses (ff02::5 and ff02::6). * - BGP4+: Maybe. You can only use global address as nexthop, and global * address as TCP endpoint address. * * Multicast routing protocols: * - PIM: Hello packet cannot be used to discover adjacent PIM routers. * Adjacent PIM routers must be configured manually (is it really spec-wise * correct thing to do?). * * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no * encapsulation defined for link-local address), and the above analysis does * not change. RFC3056 does not mandate the assignment of link-local address * either. * * 6to4 interface has security issues. Refer to * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DEFINE(if_stf); SDT_PROBE_DEFINE3(if_stf, , encapcheck, in, "struct mbuf *", "int", "int"); SDT_PROBE_DEFINE0(if_stf, , encapcheck, accept); SDT_PROBE_DEFINE3(if_stf, , getsrcifa6, in, "struct ifnet *", "struct in6_addr *", "struct in6_addr *"); SDT_PROBE_DEFINE2(if_stf, , getsrcifa6, found, "struct in6_addr *", "struct in6_addr *"); SDT_PROBE_DEFINE0(if_stf, , getsrcifa6, notfound); SDT_PROBE_DEFINE4(if_stf, , stf_output, in, "struct ifnet *", "struct mbuf *", "struct sockaddr *", "struct route *"); SDT_PROBE_DEFINE2(if_stf, , stf_output, error, "int", "int"); SDT_PROBE_DEFINE1(if_stf, , stf_output, out, "int"); SDT_PROBE_DEFINE3(if_stf, , checkaddr6, in, "struct stf_softc *", "struct in6_addr *", "struct ifnet *"); SDT_PROBE_DEFINE2(if_stf, , checkaddr6, out, "int", "int"); SDT_PROBE_DEFINE3(if_stf, , stf_input, in, "struct mbuf *", "int", "int"); SDT_PROBE_DEFINE2(if_stf, , stf_input, out, "int", "int"); SDT_PROBE_DEFINE3(if_stf, , ioctl, sv4net, "struct in_addr *", "struct in_addr *", "int"); SDT_PROBE_DEFINE1(if_stf, , ioctl, sdstv4, "struct in_addr *"); SDT_PROBE_DEFINE1(if_stf, , ioctl, ifaddr, "struct ifaddr *"); SDT_PROBE_DEFINE4(if_stf, , getin4addr_in6, out, "struct in6_addr *", "struct in6_addr *", "struct in6_addr *", "struct sockaddr_in *"); SDT_PROBE_DEFINE2(if_stf, , getin4addr, in, "struct in6_addr *", "struct in6_addr *"); SDT_PROBE_DEFINE1(if_stf, , getin4addr, out, "struct sockaddr_in *"); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "6to4 Interface"); static int stf_permit_rfc1918 = 0; SYSCTL_INT(_net_link_stf, 
OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); #define STFUNIT 0 #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) /* * XXX: Return a pointer with 16-bit aligned. Don't cast it to * struct in_addr *; use bcopy() instead. */ #define GET_V4(x) (&(x)->s6_addr16[1]) struct stf_softc { struct ifnet *sc_ifp; in_addr_t braddr; /* Border relay IPv4 address */ in_addr_t srcv4_addr; /* Our IPv4 WAN address */ u_int v4prefixlen; /* How much of the v4 address to include in our address. */ u_int sc_fibnum; const struct encaptab *encap_cookie; }; #define STF2IFP(sc) ((sc)->sc_ifp) static const char stfname[] = "stf"; static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); static const int ip_stf_ttl = 40; static int in_stf_input(struct mbuf *, int, int, void *); static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *); static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); static struct sockaddr_in *stf_getin4addr_in6(struct stf_softc *, struct sockaddr_in *, struct in6_addr, struct in6_addr, struct in6_addr); static struct sockaddr_in *stf_getin4addr(struct stf_softc *, struct sockaddr_in *, struct in6_addr, struct in6_addr); static int stf_ioctl(struct ifnet *, u_long, caddr_t); -static int stf_clone_match(struct if_clone *, const char *); -static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); -static int stf_clone_destroy(struct if_clone *, struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, stf_cloner); #define V_stf_cloner 
VNET(stf_cloner) static const struct encap_config ipv4_encap_cfg = { .proto = IPPROTO_IPV6, .min_length = sizeof(struct ip), .exact_match = (sizeof(in_addr_t) << 3) + 8, .check = stf_encapcheck, .input = in_stf_input }; static int stf_clone_match(struct if_clone *ifc, const char *name) { int i; for(i = 0; stfnames[i] != NULL; i++) { if (strcmp(stfnames[i], name) == 0) return (1); } return (0); } static int -stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +stf_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { char *dp; int err, unit, wildcard; struct stf_softc *sc; struct ifnet *ifp; err = ifc_name2unit(name, &unit); if (err != 0) return (err); wildcard = (unit < 0); /* * We can only have one unit, but since unit allocation is * already locked, we use it to keep from allocating extra * interfaces. */ unit = STFUNIT; err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); ifp = STF2IFP(sc) = if_alloc(IFT_STF); if (ifp == NULL) { free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOSPC); } ifp->if_softc = sc; sc->sc_fibnum = curthread->td_proc->p_fibnum; /* * Set the name manually rather then using if_initname because * we don't conform to the default naming convention for interfaces. * In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { /* * This can only be a programmer error and * there's no straightforward way to recover if * it happens. 
*/ panic("if_clone_create(): interface name too long"); } } strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = stfname; ifp->if_dunit = IF_DUNIT_NONE; sc->encap_cookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK); if (sc->encap_cookie == NULL) { if_printf(ifp, "attach failed\n"); free(sc, M_STF); ifc_free_unit(ifc, unit); return (ENOMEM); } ifp->if_mtu = IPV6_MMTU; ifp->if_ioctl = stf_ioctl; ifp->if_output = stf_output; ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + *ifpp = ifp; + return (0); } static int -stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct stf_softc *sc = ifp->if_softc; int err __unused; err = ip_encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_STF); ifc_free_unit(ifc, STFUNIT); return (0); } static void vnet_stf_init(const void *unused __unused) { - V_stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, - stf_clone_create, stf_clone_destroy); + struct if_clone_addreq req = { + .match_f = stf_clone_match, + .create_f = stf_clone_create, + .destroy_f = stf_clone_destroy, + }; + V_stf_cloner = ifc_attach_cloner(stfname, &req); } VNET_SYSINIT(vnet_stf_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_stf_init, NULL); static void vnet_stf_uninit(const void *unused __unused) { if_clone_detach(V_stf_cloner); V_stf_cloner = NULL; } VNET_SYSUNINIT(vnet_stf_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_stf_uninit, NULL); static int stfmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: /* Done in vnet_stf_init() */ break; case MOD_UNLOAD: /* Done in vnet_stf_uninit() */ break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t stf_mod = { "if_stf", stfmodevent, 0 }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_stf, 2); static int 
stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct stf_softc *sc; struct in6_addr addr6, mask6; struct sockaddr_in sin4addr, sin4mask; SDT_PROBE3(if_stf, , encapcheck, in, m, off, proto); sc = (struct stf_softc *)arg; if (sc == NULL) return (0); if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) return (0); /* IFF_LINK0 means "no decapsulation" */ if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) return (0); if (proto != IPPROTO_IPV6) return (0); m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); if (ip.ip_v != 4) return (0); if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0) return (0); if (sc->srcv4_addr != INADDR_ANY) { sin4addr.sin_addr.s_addr = sc->srcv4_addr; sin4addr.sin_family = AF_INET; } else if (stf_getin4addr(sc, &sin4addr, addr6, mask6) == NULL) return (0); if (sin4addr.sin_addr.s_addr != ip.ip_dst.s_addr) return (0); if (IN6_IS_ADDR_6TO4(&addr6)) { /* * 6to4 (RFC 3056). * Check if IPv4 src matches the IPv4 address derived * from the local 6to4 address masked by prefixmask. * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ memcpy(&sin4mask.sin_addr, GET_V4(&mask6), sizeof(sin4mask.sin_addr)); if ((sin4addr.sin_addr.s_addr & sin4mask.sin_addr.s_addr) != (ip.ip_src.s_addr & sin4mask.sin_addr.s_addr)) return (0); } else { /* 6rd (RFC 5569) */ /* * No restriction on the src address in the case of * 6rd because the stf(4) interface always has a * prefix which covers whole of IPv4 src address * range. So, stf_output() will catch all of * 6rd-capsuled IPv4 traffic with suspicious inner dst * IPv4 address (i.e. the IPv6 destination address is * one the admin does not like to route to outside), * and then it discard them silently. 
*/ } SDT_PROBE0(if_stf, , encapcheck, accept); /* stf interface makes single side match only */ return (32); } static int stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask) { struct ifaddr *ia; struct in_ifaddr *ia4; struct in6_addr addr6, mask6; struct sockaddr_in sin4; struct stf_softc *sc; struct in_addr in; NET_EPOCH_ASSERT(); sc = ifp->if_softc; SDT_PROBE3(if_stf, , getsrcifa6, in, ifp, addr, mask); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET6) continue; addr6 = *IFA_IN6(ia); mask6 = *IFA_MASKIN6(ia); if (sc->srcv4_addr != INADDR_ANY) bcopy(&sc->srcv4_addr, &in, sizeof(in)); else { if (stf_getin4addr(sc, &sin4, addr6, mask6) == NULL) continue; bcopy(&sin4.sin_addr, &in, sizeof(in)); } CK_LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) break; if (ia4 == NULL) continue; *addr = addr6; *mask = mask6; SDT_PROBE2(if_stf, , getsrcifa6, found, addr, mask); return (0); } SDT_PROBE0(if_stf, , getsrcifa6, notfound); return (ENOENT); } static int stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct stf_softc *sc; const struct sockaddr_in6 *dst6; struct sockaddr_in dst4, src4; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; struct in6_addr addr6, mask6; int error; SDT_PROBE4(if_stf, , stf_output, in, ifp, m, dst, ro); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); SDT_PROBE2(if_stf, , stf_output, error, error, __LINE__); return (error); } #endif sc = ifp->if_softc; dst6 = (const struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SDT_PROBE2(if_stf, , stf_output, error, ENETDOWN, __LINE__); return (ENETDOWN); } /* * If we don't have an ip4 address that match my inner ip6 address, * we shouldn't generate output. Without this check, we'll end up * using wrong IPv4 source. 
*/ if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SDT_PROBE2(if_stf, , stf_output, error, ENETDOWN, __LINE__); return (ENETDOWN); } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SDT_PROBE2(if_stf, , stf_output, error, ENOBUFS, __LINE__); return (ENOBUFS); } } ip6 = mtod(m, struct ip6_hdr *); tos = IPV6_TRAFFIC_CLASS(ip6); /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ if (stf_getin4addr_in6(sc, &dst4, addr6, mask6, ip6->ip6_dst) == NULL) { if (sc->braddr != INADDR_ANY) dst4.sin_addr.s_addr = sc->braddr; else if (stf_getin4addr_in6(sc, &dst4, addr6, mask6, dst6->sin6_addr) == NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SDT_PROBE2(if_stf, , stf_output, error, ENETUNREACH, __LINE__); return (ENETUNREACH); } } if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). 
*/ u_int af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SDT_PROBE2(if_stf, , stf_output, error, ENOBUFS, __LINE__); return (ENOBUFS); } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); if (sc->srcv4_addr != INADDR_ANY) src4.sin_addr.s_addr = sc->srcv4_addr; else if (stf_getin4addr(sc, &src4, addr6, mask6) == NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SDT_PROBE2(if_stf, , stf_output, error, ENETUNREACH, __LINE__); return (ENETUNREACH); } bcopy(&src4.sin_addr, &ip->ip_src, sizeof(ip->ip_src)); bcopy(&dst4.sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = htons(m->m_pkthdr.len); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); M_SETFIB(m, sc->sc_fibnum); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); error = ip_output(m, NULL, NULL, 0, NULL, NULL); SDT_PROBE1(if_stf, , stf_output, out, error); return (error); } static int isrfc1918addr(struct in_addr *in) { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if (stf_permit_rfc1918 == 0 && ( (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)) return (1); return (0); } static int stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp) { struct in_ifaddr *ia4; /* * reject packets with the following address: * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 */ if (IN_MULTICAST(ntohl(in->s_addr))) return (-1); switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return (-1); } /* * reject packets with broadcast */ CK_STAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == 
ia4->ia_broadaddr.sin_addr.s_addr) { return (-1); } } /* * perform ingress filter */ if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) { struct nhop_object *nh; NET_EPOCH_ASSERT(); nh = fib4_lookup(sc->sc_fibnum, *in, 0, 0, 0); if (nh == NULL) return (-1); if (nh->nh_ifp != inifp) return (-1); } return (0); } static int stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp) { SDT_PROBE3(if_stf, , checkaddr6, in, sc, in6, inifp); /* * check 6to4 addresses */ if (IN6_IS_ADDR_6TO4(in6)) { struct in_addr in4; int ret; bcopy(GET_V4(in6), &in4, sizeof(in4)); ret = stf_checkaddr4(sc, &in4, inifp); SDT_PROBE2(if_stf, , checkaddr6, out, ret, __LINE__); return (ret); } /* * reject anything that looks suspicious. the test is implemented * in ip6_input too, but we check here as well to * (1) reject bad packets earlier, and * (2) to be safe against future ip6_input change. */ if (IN6_IS_ADDR_V4COMPAT(in6)) { SDT_PROBE2(if_stf, , checkaddr6, out, -1, __LINE__); return (-1); } if (IN6_IS_ADDR_V4MAPPED(in6)) { SDT_PROBE2(if_stf, , checkaddr6, out, -1, __LINE__); return (-1); } SDT_PROBE2(if_stf, , checkaddr6, out, 0, __LINE__); return (0); } /* * Input handler for IPv6 packets encapsulated in IPv4. 'arg' is the * stf softc, 'off' the number of leading bytes trimmed with m_adj() to * reach the inner IPv6 header. The packet is dropped when proto is not * IPPROTO_IPV6, when the softc is missing or its interface is down, or * when any of the outer/inner address and route checks below fails; * otherwise it is tapped to bpf and dispatched to NETISR_IPV6. * Always returns IPPROTO_DONE. */ static int in_stf_input(struct mbuf *m, int off, int proto, void *arg) { struct stf_softc *sc = arg; struct ip ip; struct ip6_hdr *ip6; u_int8_t otos, itos; struct ifnet *ifp; struct nhop_object *nh; NET_EPOCH_ASSERT(); SDT_PROBE3(if_stf, , stf_input, in, m, off, proto); if (proto != IPPROTO_IPV6) { m_freem(m); SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } m_copydata(m, 0, sizeof(struct ip), (caddr_t)&ip); if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) { m_freem(m); SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } ifp = STF2IFP(sc); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. 
*/ if (stf_checkaddr4(sc, &ip.ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip.ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } otos = ip.ip_tos; m_adj(m, off); if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. */ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } /* * reject packets with private address range. * (requirement from RFC3056 section 2 1st paragraph) */ if ((IN6_IS_ADDR_6TO4(&ip6->ip6_src) && isrfc1918addr(&ip.ip_src)) || (IN6_IS_ADDR_6TO4(&ip6->ip6_dst) && isrfc1918addr(&ip.ip_dst))) { m_freem(m); SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } /* * Ignore if the destination is the same stf interface because * all of valid IPv6 outgoing traffic should go interfaces * except for it. */ nh = fib6_lookup(sc->sc_fibnum, &ip6->ip6_dst, 0, 0, 0); if (nh == NULL) { /* m_freem(), not m_free(): drop the whole mbuf chain, as every other drop path here does. */ m_freem(m); SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } if ((nh->nh_ifp == ifp) && (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &nh->gw6_sa.sin6_addr))) { /* Same here: free the complete chain. */ m_freem(m); SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } itos = IPV6_TRAFFIC_CLASS(ip6); if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); else ip_ecn_egress(ECN_NOCARE, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as * a four byte field. Cons up a dummy header * to pacify bpf. 
This is safe because bpf * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer to it). */ u_int32_t af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } /* * Put the packet to the network layer input queue according to the * specified address family. * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); SDT_PROBE2(if_stf, , stf_input, out, IPPROTO_DONE, __LINE__); return (IPPROTO_DONE); } static struct sockaddr_in * stf_getin4addr_in6(struct stf_softc *sc, struct sockaddr_in *sin, struct in6_addr addr6, struct in6_addr mask6, struct in6_addr in6) { int i; struct sockaddr_in *out; /* * When (src addr & src mask) != (in6 & src mask), * the dst is not in the 6rd domain. The IPv4 address must * not be used. */ for (i = 0; i < sizeof(addr6); i++) { if ((((u_char *)&addr6)[i] & ((u_char *)&mask6)[i]) != (((u_char *)&in6)[i] & ((u_char *)&mask6)[i])) { SDT_PROBE4(if_stf, , getin4addr_in6, out, &addr6, &mask6, &in6, NULL); return (NULL); } } /* After the mask check, use in6 instead of addr6. 
*/ out = stf_getin4addr(sc, sin, in6, mask6); SDT_PROBE4(if_stf, , getin4addr_in6, out, &addr6, &mask6, &in6, out); return (out); } static struct sockaddr_in * stf_getin4addr(struct stf_softc *sc, struct sockaddr_in *sin, struct in6_addr addr6, struct in6_addr mask6) { struct in_addr *in; SDT_PROBE2(if_stf, , getin4addr, in, &addr6, &mask6); memset(sin, 0, sizeof(*sin)); in = &sin->sin_addr; if (IN6_IS_ADDR_6TO4(&addr6)) { /* 6to4 (RFC 3056) */ bcopy(GET_V4(&addr6), in, sizeof(*in)); if (isrfc1918addr(in)) return (NULL); } else { /* 6rd (RFC 5569) */ in_addr_t v4prefix; uint8_t *v6 = (uint8_t*)&addr6; uint64_t v6prefix; u_int plen; u_int v4suffixlen; v4prefix = 0; if (sc->v4prefixlen < 32) { v4suffixlen = 32 - sc->v4prefixlen; v4prefix = ntohl(sc->srcv4_addr) & (0xffffffffU << v4suffixlen); } else { MPASS(sc->v4prefixlen == 32); v4suffixlen = 32; } plen = in6_mask2len(&mask6, NULL); if (plen > 64) return (NULL); /* To make this simple we do not support prefixes longer than * 64 bits. RFC5969 says "a 6rd delegated prefix SHOULD be /64 * or shorter." so this is a moderately safe assumption. */ v6prefix = be64toh(*(uint64_t *)v6); /* Shift away the v6 prefix itself. */ v6prefix <<= plen; v6prefix >>= plen; /* Now shift away everything after the v4 address. 
*/ v6prefix >>= 64 - plen - v4suffixlen; sin->sin_addr.s_addr = htonl(v4prefix | (uint32_t)v6prefix); } SDT_PROBE1(if_stf, , getin4addr, out, sin); return (sin); } static int stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifdrv *ifd; struct ifreq *ifr; struct sockaddr_in sin4; struct stf_softc *sc_cur; struct stfv4args args; int error, mtu; error = 0; sc_cur = ifp->if_softc; switch (cmd) { case SIOCSDRVSPEC: ifd = (struct ifdrv *)data; error = priv_check(curthread, PRIV_NET_ADDIFADDR); if (error) break; if (ifd->ifd_cmd == STF6RD_SV4NET) { if (ifd->ifd_len != sizeof(args)) { error = EINVAL; break; } bzero(&args, sizeof(args)); error = copyin(ifd->ifd_data, &args, ifd->ifd_len); if (error) break; if (args.v4_prefixlen < 1 || args.v4_prefixlen > 32) { error = EINVAL; break; } bcopy(&args.srcv4_addr, &sc_cur->srcv4_addr, sizeof(sc_cur->srcv4_addr)); sc_cur->v4prefixlen = args.v4_prefixlen; SDT_PROBE3(if_stf, , ioctl, sv4net, sc_cur->srcv4_addr, sc_cur->srcv4_addr, sc_cur->v4prefixlen); } else if (ifd->ifd_cmd == STF6RD_SBR) { if (ifd->ifd_len != sizeof(args)) { error = EINVAL; break; } bzero(&args, sizeof(args)); error = copyin(ifd->ifd_data, &args, ifd->ifd_len); if (error) break; sc_cur->braddr = args.braddr.s_addr; SDT_PROBE1(if_stf, , ioctl, sdstv4, sc_cur->braddr); } else error = EINVAL; break; case SIOCGDRVSPEC: ifd = (struct ifdrv *)data; if (ifd->ifd_cmd != STF6RD_GV4NET) { error = EINVAL; break; } if (ifd->ifd_len != sizeof(args)) { error = EINVAL; break; } bzero(&args, sizeof(args)); args.srcv4_addr.s_addr = sc_cur->srcv4_addr; args.braddr.s_addr = sc_cur->braddr; args.v4_prefixlen = sc_cur->v4prefixlen; error = copyout(&args, ifd->ifd_data, ifd->ifd_len); break; case SIOCSIFADDR: ifa = (struct ifaddr *)data; SDT_PROBE1(if_stf, , ioctl, ifaddr, ifa); if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } if (stf_getin4addr(sc_cur, &sin4, satosin6(ifa->ifa_addr)->sin6_addr, 
satosin6(ifa->ifa_netmask)->sin6_addr) == NULL) { error = EINVAL; break; } ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; break; case SIOCADDMULTI: case SIOCDELMULTI: ifr = (struct ifreq *)data; if (ifr && ifr->ifr_addr.sa_family == AF_INET6) ; else error = EAFNOSUPPORT; break; case SIOCGIFMTU: break; case SIOCSIFMTU: ifr = (struct ifreq *)data; mtu = ifr->ifr_mtu; /* RFC 4213 3.2 ideal world MTU */ if (mtu < IPV6_MINMTU || mtu > IF_MAXMTU - 20) return (EINVAL); ifp->if_mtu = mtu; break; default: error = EINVAL; break; } return (error); } diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index 5e1e60933caa..8328f9f94442 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -1,2011 +1,2019 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #ifdef INET6 #include #include #endif #include #include #include #include #include #include #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. 
*/ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_alias; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_UNUSED1 0x0008 #define TUN_UNUSED2 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ int tun_busy; /* busy count */ int tun_vhdrlen; /* virtio-net header length */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED); #define TUN_VMIO_FLAG_MASK 0x0fff /* * Interface capabilities of a tap device that supports the virtio-net * header. */ #define TAP_VNET_HDR_CAPS (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 \ | IFCAP_VLAN_HWCSUM \ | IFCAP_TSO | IFCAP_LRO \ | IFCAP_VLAN_HWTSO) #define TAP_ALL_OFFLOAD (CSUM_TSO | CSUM_TCP | CSUM_UDP |\ CSUM_TCP_IPV6 | CSUM_UDP_IPV6) /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. 
*/ static struct mtx tunmtx; static eventhandler_tag arrival_tag; static eventhandler_tag clone_tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user devfs cloning */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP tunnel software network interface"); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation"); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Enable legacy devfs interface creation for all users"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name); static int tun_busy_locked(struct tuntap_softc *tp); static void tun_unbusy_locked(struct tuntap_softc *tp); static int tun_busy(struct tuntap_softc *tp); static void tun_unbusy(struct tuntap_softc *tp); static int tuntap_name2info(const char *name, int *unit, int *flags); static 
void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev); static void tundtor(void *data); static void tunrename(void *arg, struct ifnet *ifp); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); -static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); -static int tun_clone_destroy(struct if_clone *, struct ifnet *); +static int tun_clone_create(struct if_clone *, char *, size_t, + struct ifc_data *, struct ifnet **); +static int tun_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen); static d_open_t tunopen; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); static struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; static struct filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; - ifc_match_t *clone_match_fn; - ifc_create_t *clone_create_fn; - ifc_destroy_t *clone_destroy_fn; + ifc_match_f *clone_match_fn; + ifc_create_f *clone_create_fn; + 
ifc_destroy_f *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Mechanism for marking a tunnel device as busy so that we can safely do some * orthogonal operations (such as operations on devices) without racing against * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or * open, to be woken up when the condition is alleviated. */ static int tun_busy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); if ((tp->tun_flags & TUN_DYING) != 0) { /* * Perhaps unintuitive, but the device is busy going away. 
* Other interpretations of EBUSY from tun_busy make little * sense, since making a busy device even more busy doesn't * sound like a problem. */ return (EBUSY); } ++tp->tun_busy; return (0); } static void tun_unbusy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel")); --tp->tun_busy; /* Wake up anything that may be waiting on our busy tunnel. */ if (tp->tun_busy == 0) cv_broadcast(&tp->tun_cv); } static int tun_busy(struct tuntap_softc *tp) { int ret; TUN_LOCK(tp); ret = tun_busy_locked(tp); TUN_UNLOCK(tp); return (ret); } static void tun_unbusy(struct tuntap_softc *tp) { TUN_LOCK(tp); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. */ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. 
Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } /* * Interface-cloner create handler shared by the tun, tap and vmnet * drivers. 'name' selects the driver and, optionally, the unit; when no * unit is given a free one is allocated. 'name' is rewritten to the final * "<driver><unit>" form. On success the new interface is returned through * *ifpp. Returns 0, or an errno value: whatever tuntap_name2info() or * tun_create_device() report, ENXIO for an unknown driver, EEXIST when the * requested unit is taken, ENOSPC when no unit is left. */ static int -tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +tun_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that's okay. 
*/ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); /* alloc_unr() yields -1 once the unit space is exhausted; fail here instead of carrying -1 into the name and the device creation below. */ if (unit == -1) return (ENOSPC); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ dev = NULL; i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); /* No preexisting struct cdev *, create one */ if (i != 0) i = tun_create_device(drv, unit, NULL, &dev, name); - if (i == 0) + if (i == 0) { tuncreate(dev); + struct tuntap_softc *tp = dev->si_drv1; + *ifpp = tp->tun_ifp; + } return (i); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. 
*/ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } i = tun_create_device(drv, u, cred, dev, name); } if (i == 0) if_clone_create(name, namelen, NULL); out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if (tp->tun_busy != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); /* destroy_dev will take care of any alias. */ destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } sx_xlock(&tun_ioctl_sx); TUN2IFP(tp)->if_softc = NULL; sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int -tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) +tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp, uint32_t flags) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; - drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, - drv->clone_match_fn, drv->clone_create_fn, - drv->clone_destroy_fn); + struct 
if_clone_addreq req = { + .match_f = drv->clone_match_fn, + .create_f = drv->clone_create_fn, + .destroy_f = drv->clone_destroy_fn, + }; + drvc->cloner = ifc_attach_cloner(drv->cdevsw.d_name, &req); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static struct tuntap_driver * tuntap_driver_from_ifnet(const struct ifnet *ifp) { struct tuntap_driver *drv; int i; if (ifp == NULL) return (NULL); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0) return (drv); } return (NULL); } static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } arrival_tag = 
EVENTHANDLER_REGISTER(ifnet_arrival_event, tunrename, 0, 1000); if (arrival_tag == NULL) return (ENOMEM); clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (clone_tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; /* We'll only ever have these two, so no need for a macro. */ static moduledata_t tun_mod = { "if_tun", NULL, 0 }; static moduledata_t tap_mod = { "if_tap", NULL, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tun, 1); DECLARE_MODULE(if_tap, tap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tap, 1); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name) { struct make_dev_args args; struct tuntap_softc *tp; int error; tp = malloc(sizeof(*tp), M_TUN, M_WAITOK | M_ZERO); mtx_init(&tp->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&tp->tun_cv, "tun_condvar"); tp->tun_flags = drv->ident_flags; tp->tun_drv = drv; make_dev_args_init(&args); if (cr != NULL) args.mda_flags = MAKEDEV_REF; args.mda_devsw = &drv->cdevsw; args.mda_cr = cr; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0600; args.mda_unit = unit; args.mda_si_drv1 = tp; error = make_dev_s(&args, dev, "%s", name); if (error != 0) { free(tp, M_TUN); return (error); } KASSERT((*dev)->si_drv1 != NULL, ("Failed to set si_drv1 at %s creation", name)); tp->tun_dev = *dev; knlist_init_mtx(&tp->tun_rsel.si_note, &tp->tun_mtx); mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); return (0); } static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { 
IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ /* XXX: should return an error code so it can fail. 
*/
/*
 * Allocate and attach the ifnet that backs an existing tun/tap cdev.
 * The softc is taken from dev->si_drv1.  A TUN_L2 softc is attached as an
 * Ethernet interface with a generated MAC address; anything else is attached
 * as a point-to-point interface with a DLT_NULL bpf tap.  TUN_INITED is set
 * only after the interface has been attached.
 */
static void
tuncreate(struct cdev *dev)
{
	struct tuntap_driver *drv;
	struct tuntap_softc *tp;
	struct ifnet *ifp;
	struct ether_addr eaddr;
	int iflags;
	u_char type;

	tp = dev->si_drv1;
	KASSERT(tp != NULL,
	    ("si_drv1 should have been initialized at creation"));

	drv = tp->tun_drv;
	iflags = IFF_MULTICAST;
	if ((tp->tun_flags & TUN_L2) != 0) {
		type = IFT_ETHER;
		iflags |= IFF_BROADCAST | IFF_SIMPLEX;
	} else {
		type = IFT_PPP;
		iflags |= IFF_POINTOPOINT;
	}
	ifp = tp->tun_ifp = if_alloc(type);
	if (ifp == NULL)
		panic("%s%d: failed to if_alloc() interface.\n",
		    drv->cdevsw.d_name, dev2unit(dev));
	ifp->if_softc = tp;
	/* The interface is named after the driver and the cdev unit number. */
	if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev));
	ifp->if_ioctl = tunifioctl;
	ifp->if_flags = iflags;
	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
	ifp->if_capabilities |= IFCAP_LINKSTATE;
	ifp->if_capenable |= IFCAP_LINKSTATE;

	if ((tp->tun_flags & TUN_L2) != 0) {
		ifp->if_init = tunifinit;
		ifp->if_start = tunstart_l2;

		ether_gen_addr(ifp, &eaddr);
		ether_ifattach(ifp, eaddr.octet);
	} else {
		ifp->if_mtu = TUNMTU;
		ifp->if_start = tunstart;
		ifp->if_output = tunoutput;

		ifp->if_snd.ifq_drv_maxlen = 0;
		IFQ_SET_READY(&ifp->if_snd);

		if_attach(ifp);
		bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
	}

	TUN_LOCK(tp);
	tp->tun_flags |= TUN_INITED;
	TUN_UNLOCK(tp);

	TUNDEBUG(ifp, "interface %s is created, minor = %#x\n",
	    ifp->if_xname, dev2unit(dev));
}

/*
 * Interface-arrival event handler.  Acts only on interfaces flagged
 * IFF_RENAMING that belong to one of the tuntap drivers: any previous device
 * alias is destroyed and, unless the new interface name equals the cdev
 * name, a new alias named after the interface is created.
 */
static void
tunrename(void *arg __unused, struct ifnet *ifp)
{
	struct tuntap_softc *tp;
	int error;

	if ((ifp->if_flags & IFF_RENAMING) == 0)
		return;

	if (tuntap_driver_from_ifnet(ifp) == NULL)
		return;

	/*
	 * We need to grab the ioctl sx long enough to make sure the softc is
	 * still there.  If it is, we can safely try to busy the tun device.
	 * The busy may fail if the device is currently dying, in which case
	 * we do nothing.  If it doesn't fail, the busy count stops the device
	 * from dying until we've created the alias (that will then be
	 * subsequently destroyed).
	 */
	sx_xlock(&tun_ioctl_sx);
	tp = ifp->if_softc;
	if (tp == NULL) {
		sx_xunlock(&tun_ioctl_sx);
		return;
	}
	error = tun_busy(tp);
	sx_xunlock(&tun_ioctl_sx);
	if (error != 0)
		return;
	if (tp->tun_alias != NULL) {
		destroy_dev(tp->tun_alias);
		tp->tun_alias = NULL;
	}

	/* No alias is needed when the interface name matches the cdev name. */
	if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0)
		goto out;

	/*
	 * Failure's ok, aliases are created on a best effort basis.  If a
	 * tun user/consumer decides to rename the interface to conflict with
	 * another device (non-ifnet) on the system, we will assume they know
	 * what they are doing.  make_dev_alias_p won't touch tun_alias on
	 * failure, so we use it but ignore the return value.
	 */
	make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s",
	    ifp->if_xname);
out:
	tun_unbusy(tp);
}

/*
 * cdevsw open routine.  Returns ENXIO if the interface has not been
 * initialized yet (TUN_INITED clear) and EBUSY if the device is already
 * open or dying.  On success the softc is busied, the opener's pid is
 * recorded, TUN_OPEN is set, the link is reported up and tundtor is
 * registered as the cdevpriv destructor.
 */
static int
tunopen(struct cdev *dev, int flag, int mode, struct thread *td)
{
	struct ifnet *ifp;
	struct tuntap_softc *tp;
	int error __diagused, tunflags;

	tunflags = 0;
	CURVNET_SET(TD_TO_VNET(td));
	error = tuntap_name2info(dev->si_name, NULL, &tunflags);
	if (error != 0) {
		CURVNET_RESTORE();
		return (error);	/* Shouldn't happen */
	}

	tp = dev->si_drv1;
	KASSERT(tp != NULL,
	    ("si_drv1 should have been initialized at creation"));

	TUN_LOCK(tp);
	if ((tp->tun_flags & TUN_INITED) == 0) {
		TUN_UNLOCK(tp);
		CURVNET_RESTORE();
		return (ENXIO);
	}
	if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) {
		TUN_UNLOCK(tp);
		CURVNET_RESTORE();
		return (EBUSY);
	}

	error = tun_busy_locked(tp);
	KASSERT(error == 0, ("Must be able to busy an unopen tunnel"));
	ifp = TUN2IFP(tp);

	if ((tp->tun_flags & TUN_L2) != 0) {
		/* Seed the stored address with the interface's own lladdr. */
		bcopy(IF_LLADDR(ifp), tp->tun_ether.octet,
		    sizeof(tp->tun_ether.octet));

		ifp->if_drv_flags |= IFF_DRV_RUNNING;
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

		if (tapuponopen)
			ifp->if_flags |= IFF_UP;
	}

	tp->tun_pid = td->td_proc->p_pid;
	tp->tun_flags |= TUN_OPEN;

	if_link_state_change(ifp, LINK_STATE_UP);
	TUNDEBUG(ifp, "open\n");
	TUN_UNLOCK(tp);

	/*
	 * This can fail with either ENOENT or EBUSY.  This is in the middle of
	 * d_open, so ENOENT should not be possible.  EBUSY is possible, but
	 * the only cdevpriv dtor being set will be tundtor and the softc being
	 * passed is constant for a given cdev.  We ignore the possible error
	 * because of this as either "unlikely" or "not actually a problem."
	 */
	(void)devfs_set_cdevpriv(tp, tundtor);
	CURVNET_RESTORE();
	return (0);
}

/*
 * tundtor - tear down the device - mark i/f down & delete
 * routing info
 *
 * Registered by tunopen() as the cdevpriv destructor; `data' is the softc.
 */
static void
tundtor(void *data)
{
	struct proc *p;
	struct tuntap_softc *tp;
	struct ifnet *ifp;
	bool l2tun;

	tp = data;
	p = curproc;
	ifp = TUN2IFP(tp);

	TUN_LOCK(tp);

	/*
	 * Realistically, we can't be obstinate here.  This only means that the
	 * tuntap device was closed out of order, and the last closer wasn't the
	 * controller.  These are still good to know about, though, as software
	 * should avoid multiple processes with a tuntap device open and
	 * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in
	 * parent).
	 */
	if (p->p_pid != tp->tun_pid) {
		log(LOG_INFO,
		    "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n",
		    p->p_pid, p->p_comm, tp->tun_dev->si_name);
	}

	/*
	 * junk all pending output
	 */
	CURVNET_SET(ifp->if_vnet);

	l2tun = false;
	if ((tp->tun_flags & TUN_L2) != 0) {
		l2tun = true;
		IF_DRAIN(&ifp->if_snd);
	} else {
		IFQ_PURGE(&ifp->if_snd);
	}

	/* For vmnet, we won't do most of the address/route bits */
	if ((tp->tun_flags & TUN_VMNET) != 0 ||
	    (l2tun && (ifp->if_flags & IFF_LINK0) != 0))
		goto out;

	/* The softc lock is dropped around the if_down() call. */
	if (ifp->if_flags & IFF_UP) {
		TUN_UNLOCK(tp);
		if_down(ifp);
		TUN_LOCK(tp);
	}

	/* Delete all addresses and routes which reference this interface. */
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
		TUN_UNLOCK(tp);
		if_purgeaddrs(ifp);
		TUN_LOCK(tp);
	}

out:
	if_link_state_change(ifp, LINK_STATE_DOWN);
	CURVNET_RESTORE();

	funsetown(&tp->tun_sigio);
	selwakeuppri(&tp->tun_rsel, PZERO + 1);
	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
	TUNDEBUG (ifp, "closed\n");
	tp->tun_flags &= ~TUN_OPEN;
	tp->tun_pid = 0;
	/* Reset the virtio-net header length to 0 on close. */
	tun_vnethdr_set(ifp, 0);

	tun_unbusy_locked(tp);
	TUN_UNLOCK(tp);
}

/*
 * Mark the interface running.  Non-L2 interfaces are additionally flagged
 * IFF_UP and have if_lastchange refreshed; L2 interfaces clear
 * IFF_DRV_OACTIVE and kick tunstart_l2() once the softc lock is released.
 */
static void
tuninit(struct ifnet *ifp)
{
	struct tuntap_softc *tp = ifp->if_softc;

	TUNDEBUG(ifp, "tuninit\n");

	TUN_LOCK(tp);
	ifp->if_drv_flags |= IFF_DRV_RUNNING;
	if ((tp->tun_flags & TUN_L2) == 0) {
		ifp->if_flags |= IFF_UP;
		getmicrotime(&ifp->if_lastchange);
		TUN_UNLOCK(tp);
	} else {
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
		TUN_UNLOCK(tp);
		/* attempt to start output */
		tunstart_l2(ifp);
	}
}

/*
 * Used only for l2 tunnel.
 *
 * if_init hook (installed by tuncreate); xtp is the softc.
 */
static void
tunifinit(void *xtp)
{
	struct tuntap_softc *tp;

	tp = (struct tuntap_softc *)xtp;
	tuninit(tp->tun_ifp);
}

/*
 * To be called under TUN_LOCK. Update ifp->if_hwassist according to the
 * current value of ifp->if_capenable.
 */
static void
tun_caps_changed(struct ifnet *ifp)
{
	uint64_t hwassist = 0;

	TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		hwassist |= CSUM_TCP | CSUM_UDP;
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
	if (ifp->if_capenable & IFCAP_TSO4)
		hwassist |= CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		hwassist |= CSUM_IP6_TSO;
	ifp->if_hwassist = hwassist;
}

/*
 * To be called under TUN_LOCK. Update tp->tun_vhdrlen and adjust
 * if_capabilities and if_capenable as needed.
 */
static void
tun_vnethdr_set(struct ifnet *ifp, int vhdrlen)
{
	struct tuntap_softc *tp = ifp->if_softc;

	TUN_LOCK_ASSERT(tp);

	/* Nothing to do when the header length is unchanged. */
	if (tp->tun_vhdrlen == vhdrlen)
		return;

	/*
	 * Update if_capabilities to reflect the
	 * functionalities offered by the virtio-net
	 * header.
	 */
	if (vhdrlen != 0)
		ifp->if_capabilities |= TAP_VNET_HDR_CAPS;
	else
		ifp->if_capabilities &= ~TAP_VNET_HDR_CAPS;
	/*
	 * Disable any capabilities that we don't
	 * support anymore.
	 */
	ifp->if_capenable &= ifp->if_capabilities;
	tun_caps_changed(ifp);
	tp->tun_vhdrlen = vhdrlen;

	TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n",
	    vhdrlen, ifp->if_capabilities);
}

/*
 * Process an ioctl request.
 *
 * This is the ifnet-side handler (if_ioctl).  The whole request runs under
 * tun_ioctl_sx; ENXIO is returned if the softc has already been detached
 * from the ifnet.  Requests not handled here go to ether_ioctl() for L2
 * interfaces and fail with EINVAL otherwise.
 */
static int
tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct ifreq *ifr = (struct ifreq *)data;
	struct tuntap_softc *tp;
	struct ifstat *ifs;
	struct ifmediareq	*ifmr;
	int		dummy, error = 0;
	bool		l2tun;

	ifmr = NULL;
	sx_xlock(&tun_ioctl_sx);
	tp = ifp->if_softc;
	if (tp == NULL) {
		error = ENXIO;
		goto bad;
	}
	l2tun = (tp->tun_flags & TUN_L2) != 0;
	switch(cmd) {
	case SIOCGIFSTATUS:
		ifs = (struct ifstat *)data;
		TUN_LOCK(tp);
		if (tp->tun_pid)
			snprintf(ifs->ascii, sizeof(ifs->ascii),
			    "\tOpened by PID %d\n", tp->tun_pid);
		else
			ifs->ascii[0] = '\0';
		TUN_UNLOCK(tp);
		break;
	case SIOCSIFADDR:
		if (l2tun)
			error = ether_ioctl(ifp, cmd, data);
		else
			tuninit(ifp);
		if (error == 0)
		    TUNDEBUG(ifp, "address set\n");
		break;
	case SIOCSIFMTU:
		/* NOTE(review): the requested MTU is stored without a range check. */
		ifp->if_mtu = ifr->ifr_mtu;
		TUNDEBUG(ifp, "mtu set\n");
		break;
	case SIOCSIFFLAGS:
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		/* Accepted; nothing to do here. */
		break;
	case SIOCGIFMEDIA:
		if (!l2tun) {
			error = EINVAL;
			break;
		}

		ifmr = (struct ifmediareq *)data;
		/* `dummy' holds the caller-supplied ifm_count before it is overwritten. */
		dummy = ifmr->ifm_count;
		ifmr->ifm_count = 1;
		ifmr->ifm_status = IFM_AVALID;
		ifmr->ifm_active = IFM_ETHER;
		if (tp->tun_flags & TUN_OPEN)
			ifmr->ifm_status |= IFM_ACTIVE;
		ifmr->ifm_current = ifmr->ifm_active;
		if (dummy >= 1) {
			int media = IFM_ETHER;
			error = copyout(&media, ifmr->ifm_ulist, sizeof(int));
		}
		break;
	case SIOCSIFCAP:
		TUN_LOCK(tp);
		ifp->if_capenable = ifr->ifr_reqcap;
		tun_caps_changed(ifp);
		TUN_UNLOCK(tp);
		VLAN_CAPABILITIES(ifp);
		break;
	default:
		if (l2tun) {
			error = ether_ioctl(ifp, cmd, data);
		} else {
			error = EINVAL;
		}
	}
bad:
	sx_xunlock(&tun_ioctl_sx);
	return (error);
}

/*
 * tunoutput - queue packets from higher level ready to
put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. 
*/
/*
 * cdevsw ioctl routine.  L2 (tap) devices first try the tap-specific
 * requests and non-L2 (tun) devices the tun-specific ones; anything not
 * handled there falls through to the ioctls common to both.  Unknown
 * requests fail with ENOTTY.
 */
static	int
tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
    struct thread *td)
{
	struct ifreq ifr, *ifrp;
	struct tuntap_softc *tp = dev->si_drv1;
	struct ifnet *ifp = TUN2IFP(tp);
	struct tuninfo *tunp;
	int error, iflags, ival;
	bool	l2tun;

	l2tun = (tp->tun_flags & TUN_L2) != 0;
	if (l2tun) {
		/* tap specific ioctls */
		switch(cmd) {
		/* VMware/VMnet port ioctl's */
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD4)
		case _IO('V', 0):
			ival = IOCPARM_IVAL(data);
			data = (caddr_t)&ival;
			/* FALLTHROUGH */
#endif
		case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */
			/*
			 * Only the TUN_VMIO_FLAG_MASK bits are taken from the
			 * caller; IFF_CANTCHANGE bits keep their current value
			 * and IFF_UP is always set.
			 */
			iflags = *(int *)data;
			iflags &= TUN_VMIO_FLAG_MASK;
			iflags &= ~IFF_CANTCHANGE;
			iflags |= IFF_UP;

			TUN_LOCK(tp);
			ifp->if_flags = iflags |
			    (ifp->if_flags & IFF_CANTCHANGE);
			TUN_UNLOCK(tp);

			return (0);
		case SIOCGIFADDR:	/* get MAC address of the remote side */
			TUN_LOCK(tp);
			bcopy(&tp->tun_ether.octet, data,
			    sizeof(tp->tun_ether.octet));
			TUN_UNLOCK(tp);

			return (0);
		case SIOCSIFADDR:	/* set MAC address of the remote side */
			TUN_LOCK(tp);
			bcopy(data, &tp->tun_ether.octet,
			    sizeof(tp->tun_ether.octet));
			TUN_UNLOCK(tp);

			return (0);
		case TAPSVNETHDR:
			/* Only 0 or one of the two virtio-net header sizes is accepted. */
			ival = *(int *)data;
			if (ival != 0 &&
			    ival != sizeof(struct virtio_net_hdr) &&
			    ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
				return (EINVAL);
			}
			TUN_LOCK(tp);
			tun_vnethdr_set(ifp, ival);
			TUN_UNLOCK(tp);

			return (0);
		case TAPGVNETHDR:
			TUN_LOCK(tp);
			*(int *)data = tp->tun_vhdrlen;
			TUN_UNLOCK(tp);

			return (0);
		}

		/* Fall through to the common ioctls if unhandled */
	} else {
		switch (cmd) {
		case TUNSLMODE:
			/* TUN_LMODE and TUN_IFHEAD are mutually exclusive. */
			TUN_LOCK(tp);
			if (*(int *)data) {
				tp->tun_flags |= TUN_LMODE;
				tp->tun_flags &= ~TUN_IFHEAD;
			} else
				tp->tun_flags &= ~TUN_LMODE;
			TUN_UNLOCK(tp);

			return (0);
		case TUNSIFHEAD:
			TUN_LOCK(tp);
			if (*(int *)data) {
				tp->tun_flags |= TUN_IFHEAD;
				tp->tun_flags &= ~TUN_LMODE;
			} else
				tp->tun_flags &= ~TUN_IFHEAD;
			TUN_UNLOCK(tp);

			return (0);
		case TUNGIFHEAD:
			TUN_LOCK(tp);
			*(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0;
			TUN_UNLOCK(tp);

			return (0);
		case TUNSIFMODE:
			/* deny this if UP */
			if (TUN2IFP(tp)->if_flags & IFF_UP)
				return (EBUSY);

			switch (*(int *)data & ~IFF_MULTICAST) {
			case IFF_POINTOPOINT:
			case IFF_BROADCAST:
				TUN_LOCK(tp);
				TUN2IFP(tp)->if_flags &=
				    ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST);
				TUN2IFP(tp)->if_flags |= *(int *)data;
				TUN_UNLOCK(tp);

				break;
			default:
				return (EINVAL);
			}

			return (0);
		case TUNSIFPID:
			/* Record the calling process as the controlling pid. */
			TUN_LOCK(tp);
			tp->tun_pid = curthread->td_proc->p_pid;
			TUN_UNLOCK(tp);

			return (0);
		}
		/* Fall through to the common ioctls if unhandled */
	}

	switch (cmd) {
	case TUNGIFNAME:
		ifrp = (struct ifreq *)data;
		strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ);

		return (0);
	case TUNSIFINFO:
		tunp = (struct tuninfo *)data;
		if (TUN2IFP(tp)->if_type != tunp->type)
			return (EPROTOTYPE);
		TUN_LOCK(tp);
		if (TUN2IFP(tp)->if_mtu != tunp->mtu) {
			/* An MTU change is routed through ifhwioctl(SIOCSIFMTU). */
			strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ);
			ifr.ifr_mtu = tunp->mtu;
			CURVNET_SET(TUN2IFP(tp)->if_vnet);
			error = ifhwioctl(SIOCSIFMTU,
			    TUN2IFP(tp), (caddr_t)&ifr, td);
			CURVNET_RESTORE();
			if (error) {
				TUN_UNLOCK(tp);
				return (error);
			}
		}
		TUN2IFP(tp)->if_baudrate = tunp->baudrate;
		TUN_UNLOCK(tp);
		break;
	case TUNGIFINFO:
		tunp = (struct tuninfo *)data;
		TUN_LOCK(tp);
		tunp->mtu = TUN2IFP(tp)->if_mtu;
		tunp->type = TUN2IFP(tp)->if_type;
		tunp->baudrate = TUN2IFP(tp)->if_baudrate;
		TUN_UNLOCK(tp);
		break;
	case TUNSDEBUG:
		tundebug = *(int *)data;
		break;
	case TUNGDEBUG:
		*(int *)data = tundebug;
		break;
	case FIONBIO:
		break;
	case FIOASYNC:
		TUN_LOCK(tp);
		if (*(int *)data)
			tp->tun_flags |= TUN_ASYNC;
		else
			tp->tun_flags &= ~TUN_ASYNC;
		TUN_UNLOCK(tp);
		break;
	case FIONREAD:
		/* Reports the byte count of the mbuf chain at the head of the queue. */
		if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) {
			struct mbuf *mb;
			IFQ_LOCK(&TUN2IFP(tp)->if_snd);
			IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb);
			for (*(int *)data = 0; mb != NULL; mb = mb->m_next)
				*(int *)data += mb->m_len;
			IFQ_UNLOCK(&TUN2IFP(tp)->if_snd);
		} else
			*(int *)data = 0;
		break;
	case FIOSETOWN:
		return (fsetown(*(int *)data, &tp->tun_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(&tp->tun_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &tp->tun_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(&tp->tun_sigio);
		return (0);

	default:
		return (ENOTTY);
	}
	return (0);
}

/*
 * The cdevsw read interface - reads a packet at a time, or at
 * least as much of a packet as can be read.
 *
 * Returns EHOSTDOWN when the tunnel is not TUN_READY, EWOULDBLOCK when the
 * queue is empty and O_NONBLOCK is set, the mtx_sleep() error if the wait
 * is interrupted, or the result of copying the packet out.
 */
static	int
tunread(struct cdev *dev, struct uio *uio, int flag)
{
	struct tuntap_softc *tp = dev->si_drv1;
	struct ifnet	*ifp = TUN2IFP(tp);
	struct mbuf	*m;
	size_t		len;
	int		error = 0;

	TUNDEBUG (ifp, "read\n");
	TUN_LOCK(tp);
	if ((tp->tun_flags & TUN_READY) != TUN_READY) {
		TUN_UNLOCK(tp);
		TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
		return (EHOSTDOWN);
	}

	tp->tun_flags &= ~TUN_RWAIT;

	/* Sleep on the softc until a packet can be dequeued. */
	for (;;) {
		IFQ_DEQUEUE(&ifp->if_snd, m);
		if (m != NULL)
			break;
		if (flag & O_NONBLOCK) {
			TUN_UNLOCK(tp);
			return (EWOULDBLOCK);
		}
		tp->tun_flags |= TUN_RWAIT;
		error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1),
		    "tunread", 0);
		if (error != 0) {
			TUN_UNLOCK(tp);
			return (error);
		}
	}
	TUN_UNLOCK(tp);

	if ((tp->tun_flags & TUN_L2) != 0)
		BPF_MTAP(ifp, m);

	/* A virtio-net header is emitted first when a header length is set. */
	len = min(tp->tun_vhdrlen, uio->uio_resid);
	if (len > 0) {
		struct virtio_net_hdr_mrg_rxbuf vhdr;

		bzero(&vhdr, sizeof(vhdr));
		if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) {
			m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr);
		}

		TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
		    "gs %u, cs %u, co %u\n", vhdr.hdr.flags,
		    vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
		    vhdr.hdr.gso_size, vhdr.hdr.csum_start,
		    vhdr.hdr.csum_offset);
		error = uiomove(&vhdr, len, uio);
	}

	/* Copy the chain out one mbuf at a time, freeing each as it goes. */
	while (m && uio->uio_resid > 0 && error == 0) {
		len = min(uio->uio_resid, m->m_len);
		if (len != 0)
			error = uiomove(mtod(m, void *), len, uio);
		m = m_free(m);
	}

	/* Whatever did not fit in the caller's buffer is discarded. */
	if (m) {
		TUNDEBUG(ifp, "Dropping mbuf\n");
		m_freem(m);
	}
	return (error);
}

/*
 * Inject a frame written to an L2 device into the interface input path.
 * The mbuf is always consumed and 0 is always returned; frames that are too
 * short, not addressed to this interface, or rejected by
 * virtio_net_rx_csum() are silently freed.  vhdr may be NULL when no
 * virtio-net header was supplied.
 */
static int
tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m,
    struct virtio_net_hdr_mrg_rxbuf *vhdr)
{
	struct epoch_tracker et;
	struct ether_header *eh;
	struct ifnet *ifp;

	ifp = TUN2IFP(tp);

	/*
	 * Only pass a unicast frame to ether_input(), if it would
	 * actually have been received by non-virtual hardware.
	 */
	if (m->m_len < sizeof(struct ether_header)) {
		m_freem(m);
		return (0);
	}

	eh = mtod(m, struct ether_header *);

	if (eh && (ifp->if_flags & IFF_PROMISC) == 0 &&
	    !ETHER_IS_MULTICAST(eh->ether_dhost) &&
	    bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) {
		m_freem(m);
		return (0);
	}

	if (vhdr != NULL && virtio_net_rx_csum(m, &vhdr->hdr)) {
		m_freem(m);
		return (0);
	}

	/* Pass packet up to parent. */
	CURVNET_SET(ifp->if_vnet);
	NET_EPOCH_ENTER(et);
	(*ifp->if_input)(ifp, m);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	/* ibytes are counted in parent */
	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
	return (0);
}

/*
 * Inject a packet written to an L3 device into the matching netisr queue.
 * With TUN_IFHEAD set the address family is read from a leading 32-bit
 * network-order word, which is then stripped; otherwise AF_INET is assumed.
 * Returns ENOBUFS if the family word cannot be pulled up and EAFNOSUPPORT
 * for a family that is not compiled in; the mbuf is consumed in all cases.
 */
static int
tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m)
{
	struct epoch_tracker et;
	struct ifnet *ifp;
	int family, isr;

	ifp = TUN2IFP(tp);
	/* Could be unlocked read? */
	TUN_LOCK(tp);
	if (tp->tun_flags & TUN_IFHEAD) {
		TUN_UNLOCK(tp);
		if (m->m_len < sizeof(family) &&
		(m = m_pullup(m, sizeof(family))) == NULL)
			return (ENOBUFS);
		family = ntohl(*mtod(m, u_int32_t *));
		m_adj(m, sizeof(family));
	} else {
		TUN_UNLOCK(tp);
		family = AF_INET;
	}

	BPF_MTAP2(ifp, &family, sizeof(family), m);

	switch (family) {
#ifdef INET
	case AF_INET:
		isr = NETISR_IP;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		isr = NETISR_IPV6;
		break;
#endif
	default:
		m_freem(m);
		return (EAFNOSUPPORT);
	}
	random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN);
	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
	CURVNET_SET(ifp->if_vnet);
	M_SETFIB(m, ifp->if_fib);
	NET_EPOCH_ENTER(et);
	netisr_dispatch(isr, m);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
	return (0);
}

/*
 * the cdevsw write interface - an atomic write is a packet - or else!
*/
/*
 * dev  - tun/tap character device; the softc is dev->si_drv1
 * uio  - user data: an optional virtio-net header (tun_vhdrlen bytes)
 *        followed by exactly one packet
 * flag - unused
 *
 * Returns 0 when the packet was accepted or silently ignored (interface
 * down, or empty write), EIO when the write is larger than the device MRU,
 * ENOBUFS when no mbuf could be allocated, the uiomove() error when the
 * header copy-in fails, or the result of tunwrite_l2()/tunwrite_l3().
 */
static	int
tunwrite(struct cdev *dev, struct uio *uio, int flag)
{
	struct virtio_net_hdr_mrg_rxbuf vhdr;
	struct tuntap_softc *tp;
	struct ifnet	*ifp;
	struct mbuf	*m;
	uint32_t	mru;
	int		align, vhdrlen, error;
	bool		l2tun;

	tp = dev->si_drv1;
	ifp = TUN2IFP(tp);
	TUNDEBUG(ifp, "tunwrite\n");
	if ((ifp->if_flags & IFF_UP) != IFF_UP)
		/* ignore silently */
		return (0);

	if (uio->uio_resid == 0)
		return (0);

	/* The MRU grows by whatever per-packet header this device expects. */
	l2tun = (tp->tun_flags & TUN_L2) != 0;
	mru = l2tun ? TAPMRU : TUNMRU;
	vhdrlen = tp->tun_vhdrlen;
	align = 0;
	if (l2tun) {
		align = ETHER_ALIGN;
		mru += vhdrlen;
	} else if ((tp->tun_flags & TUN_IFHEAD) != 0)
		mru += sizeof(uint32_t);	/* family */
	if (uio->uio_resid < 0 || uio->uio_resid > mru) {
		TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid);
		return (EIO);
	}

	if (vhdrlen > 0) {
		/*
		 * uiomove() copies at most uio_resid bytes, so a write shorter
		 * than the header would otherwise leave part of vhdr holding
		 * stack garbage that tunwrite_l2() goes on to interpret.
		 * Start from an all-zero header, as tunread() does.
		 */
		bzero(&vhdr, sizeof(vhdr));
		error = uiomove(&vhdr, vhdrlen, uio);
		if (error != 0)
			return (error);
		TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
		    "gs %u, cs %u, co %u\n", vhdr.hdr.flags,
		    vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
		    vhdr.hdr.gso_size, vhdr.hdr.csum_start,
		    vhdr.hdr.csum_offset);
	}

	if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) {
		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
		return (ENOBUFS);
	}

	m->m_pkthdr.rcvif = ifp;
#ifdef MAC
	mac_ifnet_create_mbuf(ifp, m);
#endif

	if (l2tun)
		return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL));

	return (tunwrite_l3(tp, m));
}

/*
 * tunpoll - the poll interface, this is only useful on reads
 * really. The write detect always returns true, write never blocks
 * anyway, it either accepts the packet or drops it.
*/ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. */ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. 
*/ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 58df9b21fc20..6a2d1bfb3fd1 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -1,2351 +1,2357 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * Copyright 2012 ADARA Networks, Inc. * Copyright 2017 Dell EMC Isilon * * Portions of this software were developed by Robert N. M. Watson under * contract to ADARA Networks, Inc. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. 
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. * This is sort of sneaky in the implementation, since * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that * ether_output() sends to us via if_transmit(), rewrite them for * use by the real outgoing interface, and ask it to send them. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_vlan.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? 
*/
extern void	nd6_setmtu(struct ifnet *);
#endif

#define	VLAN_DEF_HWIDTH	4
#define	VLAN_IFFLAGS	(IFF_BROADCAST | IFF_MULTICAST)

/* True when the interface is both IFF_UP and IFF_DRV_RUNNING. */
#define	UP_AND_RUNNING(ifp) \
    ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING)

CK_SLIST_HEAD(ifvlanhead, ifvlan);

/*
 * Per-parent state: the set of vlans configured on one parent interface,
 * kept either in a flat array indexed per vlan (VLAN_ARRAY) or in a hash
 * table of lists.
 */
struct ifvlantrunk {
	struct	ifnet   *parent;	/* parent interface of this trunk */
	struct	mtx	lock;
#ifdef VLAN_ARRAY
#define	VLAN_ARRAY_SIZE	(EVL_VLID_MASK + 1)
	struct	ifvlan	*vlans[VLAN_ARRAY_SIZE]; /* static table */
#else
	struct	ifvlanhead *hash;	/* dynamic hash-list table */
	uint16_t	hmask;
	/* The iteration macros below walk 1 << hwidth hash buckets. */
	uint16_t	hwidth;
#endif
	int		refcnt;
};

#if defined(KERN_TLS) || defined(RATELIMIT)
/* A vlan send tag: the generic tag header plus a pointer to another tag. */
struct vlan_snd_tag {
	struct m_snd_tag com;
	struct m_snd_tag *tag;
};

/* Recover the enclosing vlan_snd_tag from its embedded `com' member. */
static inline struct vlan_snd_tag *
mst_to_vst(struct m_snd_tag *mst)
{

	return (__containerof(mst, struct vlan_snd_tag, com));
}
#endif

/*
 * This macro provides a facility to iterate over every vlan on a trunk with
 * the assumption that none will be added/removed during iteration.
 *
 * Note that the expansion declares local variables (_i, and _next in the
 * hash case) in the enclosing scope.
 */
#ifdef VLAN_ARRAY
#define VLAN_FOREACH(_ifv, _trunk) \
	size_t _i; \
	for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \
		if (((_ifv) = (_trunk)->vlans[_i]) != NULL)
#else /* VLAN_ARRAY */
#define VLAN_FOREACH(_ifv, _trunk) \
	struct ifvlan *_next; \
	size_t _i; \
	for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \
		CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next)
#endif /* VLAN_ARRAY */

/*
 * This macro provides a facility to iterate over every vlan on a trunk while
 * also modifying the number of vlans on the trunk. The iteration continues
 * until some condition is met or there are no more vlans on the trunk.
 */
#ifdef VLAN_ARRAY
/* The VLAN_ARRAY case is simple -- just a for loop using the condition. */
#define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
	size_t _i; \
	for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \
		if (((_ifv) = (_trunk)->vlans[_i]))
#else /* VLAN_ARRAY */
/*
 * The hash table case is more complicated. We allow for the hash table to be
 * modified (i.e. vlans removed) while we are iterating over it. To allow for
 * this we must restart the iteration every time we "touch" something during
 * the iteration, since removal will resize the hash table and invalidate our
 * current position. If acting on the touched element causes the trunk to be
 * emptied, then iteration also stops.
 */
#define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
	size_t _i; \
	bool _touch = false; \
	for (_i = 0; \
	    !(_cond) && _i < (1 << (_trunk)->hwidth); \
	    _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \
		if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \
		    (_touch = true))
#endif /* VLAN_ARRAY */

/* One entry on a vlan's multicast address list (vlan_mc_listhead). */
struct vlan_mc_entry {
	struct sockaddr_dl		mc_addr;
	CK_SLIST_ENTRY(vlan_mc_entry)	mc_entries;
	struct epoch_context		mc_epoch_ctx;
};

/* Per-vlan-interface state. */
struct ifvlan {
	struct	ifvlantrunk *ifv_trunk;
	struct	ifnet *ifv_ifp;
#define	TRUNK(ifv)	((ifv)->ifv_trunk)
#define	PARENT(ifv)	(TRUNK(ifv)->parent)
	void	*ifv_cookie;
	int	ifv_pflags;	/* special flags we have set on parent */
	int	ifv_capenable;
	int	ifv_encaplen;	/* encapsulation length */
	int	ifv_mtufudge;	/* MTU fudged by this much */
	int	ifv_mintu;	/* min transmission unit */
	struct  ether_8021q_tag ifv_qtag;
	/* Shorthand accessors for the fields of ifv_qtag. */
#define ifv_proto	ifv_qtag.proto
#define ifv_vid		ifv_qtag.vid
#define ifv_pcp		ifv_qtag.pcp
	struct task lladdr_task;
	CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead;
#ifndef VLAN_ARRAY
	/* Linkage on the trunk's hash bucket list. */
	CK_SLIST_ENTRY(ifvlan) ifv_list;
#endif
};

/* Special flags we should propagate to parent.
*/
/*
 * Each entry pairs an interface flag with the function used to apply it to
 * an interface; the table is terminated by a {0, NULL} entry.
 */
static struct {
	int flag;
	int (*func)(struct ifnet *, int);
} vlan_pflags[] = {
	{IFF_PROMISC, ifpromisc},
	{IFF_ALLMULTI, if_allmulti},
	{0, NULL}
};

VNET_DECLARE(int, vlan_mtag_pcp);
#define	V_vlan_mtag_pcp	VNET(vlan_mtag_pcp)

static const char vlanname[] = "vlan";
static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface");

static eventhandler_tag ifdetach_tag;
static eventhandler_tag iflladdr_tag;
static eventhandler_tag ifevent_tag;

/*
 * if_vlan uses two module-level synchronization primitives to allow concurrent
 * modification of vlan interfaces and (mostly) allow for vlans to be destroyed
 * while they are being used for tx/rx. To accomplish this in a way that has
 * acceptable performance and cooperation with other parts of the network stack
 * there is a non-sleepable epoch(9) and an sx(9).
 *
 * The performance-sensitive paths that warrant using the epoch(9) are
 * vlan_transmit and vlan_input. Both have to check for the vlan interface's
 * existence using if_vlantrunk, and being in the network tx/rx paths the use
 * of an epoch(9) gives a measurable improvement in performance.
 *
 * The reason for having an sx(9) is mostly because there are still areas that
 * must be sleepable and also have safe concurrent access to a vlan interface.
 * Since the sx(9) exists, it is used by default in most paths unless sleeping
 * is not permitted, or if it is not clear whether sleeping is permitted.
* */ #define _VLAN_SX_ID ifv_sx static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_INIT() \ sx_init_flags(&_VLAN_SX_ID, "vlan_sx", SX_RECURSE) #define VLAN_LOCKING_DESTROY() \ sx_destroy(&_VLAN_SX_ID) #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) #define VLAN_XLOCK() sx_xlock(&_VLAN_SX_ID) #define VLAN_XUNLOCK() sx_xunlock(&_VLAN_SX_ID) #define VLAN_SLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_SLOCKED) #define VLAN_XLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_XLOCKED) #define VLAN_SXLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_LOCKED) /* * We also have a per-trunk mutex that should be acquired when changing * its state. */ #define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) #define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) #define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) #define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) #define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); /* * The VLAN_ARRAY substitutes the dynamic hash with a static array * with 4096 entries. In theory this can give a boost in processing, * however in practice it does not. Probably this is because the array * is too big to fit into CPU cache. 
*/ #ifndef VLAN_ARRAY static void vlan_inithash(struct ifvlantrunk *trunk); static void vlan_freehash(struct ifvlantrunk *trunk); static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid); #endif static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); static int vlan_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int vlan_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void vlan_snd_tag_free(struct m_snd_tag *); static struct m_snd_tag *vlan_next_snd_tag(struct m_snd_tag *); static void vlan_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *); #endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); #ifdef ALTQ static void vlan_altq_start(struct ifnet *ifp); static int vlan_altq_transmit(struct ifnet *ifp, struct mbuf *m); #endif static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag, uint16_t proto); static void vlan_link_state(struct ifnet *ifp); static void 
vlan_capabilities(struct ifvlan *ifv); static void vlan_trunk_capabilities(struct ifnet *ifp); static struct ifnet *vlan_clone_match_ethervid(const char *, int *); static int vlan_clone_match(struct if_clone *, const char *); -static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t); -static int vlan_clone_destroy(struct if_clone *, struct ifnet *); +static int vlan_clone_create(struct if_clone *, char *, size_t, + struct ifc_data *, struct ifnet **); +static int vlan_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static void vlan_ifdetach(void *arg, struct ifnet *ifp); static void vlan_iflladdr(void *arg, struct ifnet *ifp); static void vlan_ifevent(void *arg, struct ifnet *ifp, int event); static void vlan_lladdr_fn(void *arg, int pending); static struct if_clone *vlan_cloner; #ifdef VIMAGE VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner); #define V_vlan_cloner VNET(vlan_cloner) #endif #ifdef RATELIMIT static const struct if_snd_tag_sw vlan_snd_tag_ul_sw = { .snd_tag_modify = vlan_snd_tag_modify, .snd_tag_query = vlan_snd_tag_query, .snd_tag_free = vlan_snd_tag_free, .next_snd_tag = vlan_next_snd_tag, .type = IF_SND_TAG_TYPE_UNLIMITED }; static const struct if_snd_tag_sw vlan_snd_tag_rl_sw = { .snd_tag_modify = vlan_snd_tag_modify, .snd_tag_query = vlan_snd_tag_query, .snd_tag_free = vlan_snd_tag_free, .next_snd_tag = vlan_next_snd_tag, .type = IF_SND_TAG_TYPE_RATE_LIMIT }; #endif #ifdef KERN_TLS static const struct if_snd_tag_sw vlan_snd_tag_tls_sw = { .snd_tag_modify = vlan_snd_tag_modify, .snd_tag_query = vlan_snd_tag_query, .snd_tag_free = vlan_snd_tag_free, .next_snd_tag = vlan_next_snd_tag, .type = IF_SND_TAG_TYPE_TLS }; #ifdef RATELIMIT static const struct if_snd_tag_sw vlan_snd_tag_tls_rl_sw = { .snd_tag_modify = vlan_snd_tag_modify, .snd_tag_query = vlan_snd_tag_query, .snd_tag_free = vlan_snd_tag_free, .next_snd_tag = vlan_next_snd_tag, .type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT }; #endif #endif static void vlan_mc_free(struct 
epoch_context *ctx) { struct vlan_mc_entry *mc = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx); free(mc, M_VLAN); } #ifndef VLAN_ARRAY #define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m)) static void vlan_inithash(struct ifvlantrunk *trunk) { int i, n; /* * The trunk must not be locked here since we call malloc(M_WAITOK). * It is OK in case this function is called before the trunk struct * gets hooked up and becomes visible from other threads. */ KASSERT(trunk->hwidth == 0 && trunk->hash == NULL, ("%s: hash already initialized", __func__)); trunk->hwidth = VLAN_DEF_HWIDTH; n = 1 << trunk->hwidth; trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) CK_SLIST_INIT(&trunk->hash[i]); } static void vlan_freehash(struct ifvlantrunk *trunk) { #ifdef INVARIANTS int i; KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); trunk->hash = NULL; trunk->hwidth = trunk->hmask = 0; } static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); /* * Grow the hash when the number of vlans exceeds half of the number of * hash buckets squared. This will make the average linked-list length * buckets/2. 
*/ if (trunk->refcnt > (b * b) / 2) { vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); } static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << (trunk->hwidth - 1); i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); } panic("%s: vlan not found\n", __func__); return (ENOENT); /*NOTREACHED*/ } /* * Grow the hash larger or smaller if memory permits. */ static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch) { struct ifvlan *ifv; struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { /* Harmless yet obvious coding error */ printf("%s: howmuch is 0\n", __func__); return; } hwidth2 = trunk->hwidth + howmuch; n = 1 << trunk->hwidth; n2 = 1 << hwidth2; /* Do not shrink the table below the default */ if (hwidth2 < VLAN_DEF_HWIDTH) return; hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) CK_SLIST_INIT(&hash2[j]); for (i = 0; i < n; i++) while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) { CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } NET_EPOCH_WAIT(); free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; trunk->hmask = n2 - 1; if (bootverbose) if_printf(trunk->parent, "VLAN hash table resized from %d to %d buckets\n", n, n2); } 
/*
 * Hash-table lookup: walk the bucket selected by HASH(vid, hmask) and
 * return the vlan whose VID equals 'vid', or NULL when there is none.
 * Asserts that the caller is inside the network epoch.
 */
static __inline struct ifvlan *
vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
{
	struct ifvlan *ifv;

	NET_EPOCH_ASSERT();

	CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list)
		if (ifv->ifv_vid == vid)
			return (ifv);
	return (NULL);
}

#if 0
/* Debugging code to view the hashtables. */
static void
vlan_dumphash(struct ifvlantrunk *trunk)
{
	int i;
	struct ifvlan *ifv;

	for (i = 0; i < (1 << trunk->hwidth); i++) {
		printf("%d: ", i);
		CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list)
			printf("%s ", ifv->ifv_ifp->if_xname);
		printf("\n");
	}
}
#endif /* 0 */
#else

/*
 * VLAN_ARRAY variants: the trunk keeps a vlans[] array that is indexed
 * directly by VID, so lookup, insert and remove are single array accesses.
 */
static __inline struct ifvlan *
vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
{

	return trunk->vlans[vid];
}

/* Returns EEXIST when the slot for ifv's VID is already taken, else 0. */
static __inline int
vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
{

	if (trunk->vlans[ifv->ifv_vid] != NULL)
		return EEXIST;
	trunk->vlans[ifv->ifv_vid] = ifv;
	trunk->refcnt++;

	return (0);
}

/* Clears the slot for ifv's VID unconditionally; always returns 0. */
static __inline int
vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
{

	trunk->vlans[ifv->ifv_vid] = NULL;
	trunk->refcnt--;

	return (0);
}

/* Nothing to free or set up in the array variant. */
static __inline void
vlan_freehash(struct ifvlantrunk *trunk)
{
}

static __inline void
vlan_inithash(struct ifvlantrunk *trunk)
{
}

#endif /* !VLAN_ARRAY */

/*
 * Tear down a trunk: free its hash, clear the parent's if_vlantrunk
 * pointer, destroy the per-trunk mutex, release the parent with if_rele()
 * and free the trunk itself.  The exclusive VLAN lock must be held
 * (asserted).
 */
static void
trunk_destroy(struct ifvlantrunk *trunk)
{
	VLAN_XLOCK_ASSERT();

	vlan_freehash(trunk);
	trunk->parent->if_vlantrunk = NULL;
	TRUNK_LOCK_DESTROY(trunk);
	if_rele(trunk->parent);
	free(trunk, M_VLAN);
}

/*
 * Program our multicast filter. What we're actually doing is
 * programming the multicast filter of the parent. This has the
 * side effect of causing the parent interface to receive multicast
 * traffic that it doesn't really want, which ends up being discarded
 * later by the upper protocol layers. Unfortunately, there's no way
 * to avoid this: there really is only one physical interface.
 *
 * The exclusive VLAN lock must be held (asserted).  Returns 0 on success,
 * ENOMEM when a list entry cannot be allocated, or the error reported by
 * if_addmulti() on the parent.
 */
static int
vlan_setmulti(struct ifnet *ifp)
{
	struct ifnet		*ifp_p;
	struct ifmultiaddr	*ifma;
	struct ifvlan		*sc;
	struct vlan_mc_entry	*mc;
	int			error;

	VLAN_XLOCK_ASSERT();

	/* Find the parent. */
	sc = ifp->if_softc;
	ifp_p = PARENT(sc);

	CURVNET_SET_QUIET(ifp_p->if_vnet);

	/* First, remove any existing filter entries. */
	while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) {
		CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries);
		(void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr);
		/* The entry itself is freed later, via vlan_mc_free(). */
		NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx);
	}

	/* Now program new ones. */
	IF_ADDR_WLOCK(ifp);
	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* NOTE(review): presumably M_NOWAIT because the address lock is held. */
		mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT);
		if (mc == NULL) {
			IF_ADDR_WUNLOCK(ifp);
			CURVNET_RESTORE();
			return (ENOMEM);
		}
		bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len);
		/* Retarget the copied link-level address at the parent. */
		mc->mc_addr.sdl_index = ifp_p->if_index;
		CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries);
	}
	IF_ADDR_WUNLOCK(ifp);
	/* Join the collected groups on the parent after dropping the lock. */
	CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) {
		error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr,
		    NULL);
		if (error) {
			CURVNET_RESTORE();
			return (error);
		}
	}

	CURVNET_RESTORE();
	return (0);
}

/*
 * A handler for interface ifnet events.
 * Only IFNET_EVENT_UPDATE_BAUDRATE is acted upon: the parent's baudrate
 * is copied to every vlan on its trunk.
 */
static void
vlan_ifevent(void *arg __unused, struct ifnet *ifp, int event)
{
	struct epoch_tracker et;
	struct ifvlan *ifv;
	struct ifvlantrunk *trunk;

	if (event != IFNET_EVENT_UPDATE_BAUDRATE)
		return;

	NET_EPOCH_ENTER(et);
	trunk = ifp->if_vlantrunk;
	if (trunk == NULL) {
		/* Not a trunk parent: nothing to propagate. */
		NET_EPOCH_EXIT(et);
		return;
	}

	TRUNK_WLOCK(trunk);
	VLAN_FOREACH(ifv, trunk) {
		ifv->ifv_ifp->if_baudrate = ifp->if_baudrate;
	}
	TRUNK_WUNLOCK(trunk);
	NET_EPOCH_EXIT(et);
}

/*
 * A handler for parent interface link layer address changes.
 * If the parent interface link layer address is changed we
 * should also change it on all children vlans.
 */
static void
vlan_iflladdr(void *arg __unused, struct ifnet *ifp)
{
	struct epoch_tracker et;
	struct ifvlan *ifv;
	struct ifnet *ifv_ifp;
	struct ifvlantrunk *trunk;
	struct sockaddr_dl *sdl;

	/* Need the epoch since this is run on taskqueue_swi. */
	NET_EPOCH_ENTER(et);
	trunk = ifp->if_vlantrunk;
	if (trunk == NULL) {
		NET_EPOCH_EXIT(et);
		return;
	}

	/*
	 * OK, it's a trunk.  Loop over and change all vlan's lladdrs on it.
	 * We need an exclusive lock here to prevent concurrent SIOCSIFLLADDR
	 * ioctl calls on the parent garbling the lladdr of the child vlan.
	 */
	TRUNK_WLOCK(trunk);
	VLAN_FOREACH(ifv, trunk) {
		/*
		 * Copy the new lladdr into the ifv_ifp, enqueue a task
		 * to actually call if_setlladdr. if_setlladdr needs to
		 * be deferred to a taskqueue because it will call into
		 * the if_vlan ioctl path and try to acquire the global
		 * lock.
		 */
		ifv_ifp = ifv->ifv_ifp;
		bcopy(IF_LLADDR(ifp), IF_LLADDR(ifv_ifp),
		    ifp->if_addrlen);
		sdl = (struct sockaddr_dl *)ifv_ifp->if_addr->ifa_addr;
		sdl->sdl_alen = ifp->if_addrlen;
		taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
	}
	TRUNK_WUNLOCK(trunk);
	NET_EPOCH_EXIT(et);
}

/*
 * A handler for network interface departure events.
 * Track departure of trunks here so that we don't access invalid
 * pointers or whatever if a trunk is ripped from under us, e.g.,
 * by ejecting its hot-plug card.  However, if an ifnet is simply
 * being renamed, then there's no need to tear down the state.
 */
static void
vlan_ifdetach(void *arg __unused, struct ifnet *ifp)
{
	struct ifvlan *ifv;
	struct ifvlantrunk *trunk;

	/* If the ifnet is just being renamed, don't do anything. */
	if (ifp->if_flags & IFF_RENAMING)
		return;
	VLAN_XLOCK();
	trunk = ifp->if_vlantrunk;
	if (trunk == NULL) {
		VLAN_XUNLOCK();
		return;
	}

	/*
	 * OK, it's a trunk.  Loop over and detach all vlan's on it.
	 * Check trunk pointer after each vlan_unconfig() as it will
	 * free it and set to NULL after the last vlan was detached.
	 */
	VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk,
	    ifp->if_vlantrunk == NULL)
		vlan_unconfig_locked(ifv->ifv_ifp, 1);

	/* Trunk should have been destroyed in vlan_unconfig(). */
	KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__));
	VLAN_XUNLOCK();
}

/*
 * Return the trunk device for a virtual interface.
*/ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; ifp = NULL; if (ifv->ifv_trunk) ifp = PARENT(ifv); return (ifp); } /* * Return the 12-bit VLAN VID for this interface, for use by external * components such as Infiniband. * * XXXRW: Note that the function name here is historical; it should be named * vlan_vid(). */ static int vlan_tag(struct ifnet *ifp, uint16_t *vidp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *vidp = ifv->ifv_vid; return (0); } static int vlan_pcp(struct ifnet *ifp, uint16_t *pcpp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *pcpp = ifv->ifv_pcp; return (0); } /* * Return a driver specific cookie for this interface. Synchronization * with setcookie must be provided by the driver. */ static void * vlan_cookie(struct ifnet *ifp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; return (ifv->ifv_cookie); } /* * Store a cookie in our softc that drivers can use to store driver * private per-instance data in. */ static int vlan_setcookie(struct ifnet *ifp, void *cookie) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; ifv->ifv_cookie = cookie; return (0); } /* * Return the vlan device present at the specific VID. */ static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { struct ifvlantrunk *trunk; struct ifvlan *ifv; NET_EPOCH_ASSERT(); trunk = ifp->if_vlantrunk; if (trunk == NULL) return (NULL); ifp = NULL; ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; return (ifp); } /* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and * set here. 
No one else in the system should be aware of this so * we use an explicit reference here. */ extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* For if_link_state_change() eyes only... */ extern void (*vlan_link_state_p)(struct ifnet *); +static struct if_clone_addreq vlan_addreq = { + .match_f = vlan_clone_match, + .create_f = vlan_clone_create, + .destroy_f = vlan_clone_destroy, +}; + static int vlan_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if (ifdetach_tag == NULL) return (ENOMEM); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY); if (iflladdr_tag == NULL) return (ENOMEM); ifevent_tag = EVENTHANDLER_REGISTER(ifnet_event, vlan_ifevent, NULL, EVENTHANDLER_PRI_ANY); if (ifevent_tag == NULL) return (ENOMEM); VLAN_LOCKING_INIT(); vlan_input_p = vlan_input; vlan_link_state_p = vlan_link_state; vlan_trunk_cap_p = vlan_trunk_capabilities; vlan_trunkdev_p = vlan_trunkdev; vlan_cookie_p = vlan_cookie; vlan_setcookie_p = vlan_setcookie; vlan_tag_p = vlan_tag; vlan_pcp_p = vlan_pcp; vlan_devat_p = vlan_devat; #ifndef VIMAGE - vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, - vlan_clone_create, vlan_clone_destroy); + vlan_cloner = ifc_attach_cloner(vlanname, &vlan_addreq); #endif if (bootverbose) printf("vlan: initialized, using " #ifdef VLAN_ARRAY "full-size arrays" #else "hash tables with chaining" #endif "\n"); break; case MOD_UNLOAD: #ifndef VIMAGE - if_clone_detach(vlan_cloner); + ifc_detach_cloner(vlan_cloner); #endif EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); EVENTHANDLER_DEREGISTER(ifnet_event, ifevent_tag); vlan_input_p = NULL; vlan_link_state_p = NULL; vlan_trunk_cap_p = NULL; vlan_trunkdev_p = NULL; vlan_tag_p = NULL; vlan_cookie_p = NULL; vlan_setcookie_p = NULL; vlan_devat_p = 
NULL; VLAN_LOCKING_DESTROY(); if (bootverbose) printf("vlan: unloaded\n"); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t vlan_mod = { "if_vlan", vlan_modevent, 0 }; DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vlan, 3); #ifdef VIMAGE static void vnet_vlan_init(const void *unused __unused) { - - vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, - vlan_clone_create, vlan_clone_destroy); + vlan_cloner = ifc_attach_cloner(vlanname, &vlan_addreq); V_vlan_cloner = vlan_cloner; } VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_vlan_init, NULL); static void vnet_vlan_uninit(const void *unused __unused) { - if_clone_detach(V_vlan_cloner); + ifc_detach_cloner(V_vlan_cloner); } VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_vlan_uninit, NULL); #endif /* * Check for .[. ...] style interface names. */ static struct ifnet * vlan_clone_match_ethervid(const char *name, int *vidp) { char ifname[IFNAMSIZ]; char *cp; struct ifnet *ifp; int vid; strlcpy(ifname, name, IFNAMSIZ); if ((cp = strrchr(ifname, '.')) == NULL) return (NULL); *cp = '\0'; if ((ifp = ifunit_ref(ifname)) == NULL) return (NULL); /* Parse VID. 
*/ if (*++cp == '\0') { if_rele(ifp); return (NULL); } vid = 0; for(; *cp >= '0' && *cp <= '9'; cp++) vid = (vid * 10) + (*cp - '0'); if (*cp != '\0') { if_rele(ifp); return (NULL); } if (vidp != NULL) *vidp = vid; return (ifp); } static int vlan_clone_match(struct if_clone *ifc, const char *name) { struct ifnet *ifp; const char *cp; ifp = vlan_clone_match_ethervid(name, NULL); if (ifp != NULL) { if_rele(ifp); return (1); } if (strncmp(vlanname, name, strlen(vlanname)) != 0) return (0); for (cp = name + 4; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int -vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +vlan_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { char *dp; bool wildcard = false; bool subinterface = false; int unit; int error; int vid = 0; uint16_t proto = ETHERTYPE_VLAN; struct ifvlan *ifv; struct ifnet *ifp; struct ifnet *p = NULL; struct ifaddr *ifa; struct sockaddr_dl *sdl; struct vlanreq vlr; static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * There are three ways to specify the cloned device: * o pass a parameter block with the clone request. * o specify parameters in the text of the clone device name * o specify no parameters and get an unattached device that * must be configured separately. * The first technique is preferred; the latter two are supported * for backwards compatibility. * * XXXRW: Note historic use of the word "tag" here. New ioctls may be * called for. */ - if (params) { - error = copyin(params, &vlr, sizeof(vlr)); + if (ifd->params != NULL) { + error = ifc_copyin(ifd, &vlr, sizeof(vlr)); if (error) return error; vid = vlr.vlr_tag; proto = vlr.vlr_proto; #ifdef COMPAT_FREEBSD12 if (proto == 0) proto = ETHERTYPE_VLAN; #endif p = ifunit_ref(vlr.vlr_parent); if (p == NULL) return (ENXIO); } if ((error = ifc_name2unit(name, &unit)) == 0) { /* * vlanX interface. 
Set wildcard to true if the unit number * is not fixed (-1) */ wildcard = (unit < 0); } else { struct ifnet *p_tmp = vlan_clone_match_ethervid(name, &vid); if (p_tmp != NULL) { error = 0; subinterface = true; unit = IF_DUNIT_NONE; wildcard = false; if (p != NULL) { if_rele(p_tmp); if (p != p_tmp) error = EINVAL; } else p = p_tmp; } else error = ENXIO; } if (error != 0) { if (p != NULL) if_rele(p); return (error); } if (!subinterface) { /* vlanX interface, mark X as busy or allocate new unit # */ error = ifc_alloc_unit(ifc, &unit); if (error != 0) { if (p != NULL) if_rele(p); return (error); } } /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { panic("%s: interface name too long", __func__); } } ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { if (!subinterface) ifc_free_unit(ifc, unit); free(ifv, M_VLAN); if (p != NULL) if_rele(p); return (ENOSPC); } CK_SLIST_INIT(&ifv->vlan_mc_listhead); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because * we don't conform to the default naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; ifp->if_init = vlan_init; #ifdef ALTQ ifp->if_start = vlan_altq_start; ifp->if_transmit = vlan_altq_transmit; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); #else ifp->if_transmit = vlan_transmit; #endif ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; ifp->if_ratelimit_query = vlan_ratelimit_query; #endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... 
*/ ifp->if_baudrate = 0; ifp->if_type = IFT_L2VLAN; ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_L2VLAN; if (p != NULL) { error = vlan_config(ifv, p, vid, proto); if_rele(p); if (error != 0) { /* * Since we've partially failed, we need to back * out all the way, otherwise userland could get * confused. Thus, we destroy the interface. */ ether_ifdetach(ifp); vlan_unconfig(ifp); if_free(ifp); if (!subinterface) ifc_free_unit(ifc, unit); free(ifv, M_VLAN); return (error); } } + *ifpp = ifp; return (0); } static int -vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct ifvlan *ifv = ifp->if_softc; int unit = ifp->if_dunit; if (ifp->if_vlantrunk) return (EBUSY); #ifdef ALTQ IFQ_PURGE(&ifp->if_snd); #endif ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ /* * We should have the only reference to the ifv now, so we can now * drain any remaining lladdr task before freeing the ifnet and the * ifvlan. */ taskqueue_drain(taskqueue_thread, &ifv->lladdr_task); NET_EPOCH_WAIT(); if_free(ifp); free(ifv, M_VLAN); if (unit != IF_DUNIT_NONE) ifc_free_unit(ifc, unit); return (0); } /* * The ifp->if_init entry point for vlan(4) is a no-op. */ static void vlan_init(void *foo __unused) { } /* * The if_transmit method for vlan(4) interface. */ static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; NET_EPOCH_ASSERT(); ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } p = PARENT(ifv); len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 
1 : 0; BPF_MTAP(ifp, m); #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { struct vlan_snd_tag *vst; struct m_snd_tag *mst; MPASS(m->m_pkthdr.snd_tag->ifp == ifp); mst = m->m_pkthdr.snd_tag; vst = mst_to_vst(mst); if (vst->tag->ifp != p) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (EAGAIN); } m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag); m_snd_tag_rele(mst); } #endif /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } if (!ether_8021q_frame(&m, ifp, p, &ifv->ifv_qtag)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (0); } /* * Send it, precisely as ether_output() would have. */ error = (p->if_transmit)(p, m); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct ifvlan *ifv; struct ifnet *p; NET_EPOCH_ASSERT(); /* * Find the first non-VLAN parent interface. 
*/ ifv = ifp->if_softc; do { if (TRUNK(ifv) == NULL) { m_freem(m); return (ENETDOWN); } p = PARENT(ifv); ifv = p->if_softc; } while (p->if_type == IFT_L2VLAN); return p->if_output(ifp, m, dst, ro); } #ifdef ALTQ static void vlan_altq_start(if_t ifp) { struct ifaltq *ifq = &ifp->if_snd; struct mbuf *m; IFQ_LOCK(ifq); IFQ_DEQUEUE_NOLOCK(ifq, m); while (m != NULL) { vlan_transmit(ifp, m); IFQ_DEQUEUE_NOLOCK(ifq, m); } IFQ_UNLOCK(ifq); } static int vlan_altq_transmit(if_t ifp, struct mbuf *m) { int err; if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_ENQUEUE(&ifp->if_snd, m, err); if (err == 0) vlan_altq_start(ifp); } else err = vlan_transmit(ifp, m); return (err); } #endif /* ALTQ */ /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ static void vlan_qflush(struct ifnet *ifp __unused) { } static void vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk; struct ifvlan *ifv; struct m_tag *mtag; uint16_t vid, tag; NET_EPOCH_ASSERT(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { m_freem(m); return; } if (m->m_flags & M_VLANTAG) { /* * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ tag = m->m_pkthdr.ether_vtag; m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; /* * Packet is tagged in-band as specified by 802.1q. */ switch (ifp->if_type) { case IFT_ETHER: if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); return; } evl = mtod(m, struct ether_vlan_header *); tag = ntohs(evl->evl_tag); /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. 
*/
			bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
			    ETHER_HDR_LEN - ETHER_TYPE_LEN);
			m_adj(m, ETHER_VLAN_ENCAP_LEN);
			break;

		default:
#ifdef INVARIANTS
			panic("%s: %s has unsupported if_type %u",
			    __func__, ifp->if_xname, ifp->if_type);
#endif
			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
			m_freem(m);
			return;
		}
	}

	vid = EVL_VLANOFTAG(tag);

	ifv = vlan_gethash(trunk, vid);
	if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) {
		if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
		m_freem(m);
		return;
	}

	if (V_vlan_mtag_pcp) {
		/*
		 * While uncommon, it is possible that we will find a 802.1q
		 * packet encapsulated inside another packet that also had an
		 * 802.1q header.  For example, ethernet tunneled over IPSEC
		 * arriving over ethernet.  In that case, we replace the
		 * existing 802.1q PCP m_tag value.
		 */
		mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
		if (mtag == NULL) {
			mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN,
			    sizeof(uint8_t), M_NOWAIT);
			if (mtag == NULL) {
				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
				m_freem(m);
				return;
			}
			m_tag_prepend(m, mtag);
		}
		*(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag);
	}

	m->m_pkthdr.rcvif = ifv->ifv_ifp;
	if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1);

	/* Pass it back through the parent's input routine. */
	(*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m);
}

/*
 * Task handler for ifv->lladdr_task, which vlan_config() initialises with
 * TASK_INIT() and enqueues on taskqueue_thread.
 *
 * arg is the struct ifvlan; pending is unused.  The handler switches to
 * the interface's vnet and passes the interface's own stored link-level
 * address back through if_setlladdr().
 */
static void
vlan_lladdr_fn(void *arg, int pending __unused)
{
	struct ifvlan *ifv;
	struct ifnet *ifp;

	ifv = (struct ifvlan *)arg;
	ifp = ifv->ifv_ifp;

	CURVNET_SET(ifp->if_vnet);

	/* The ifv_ifp already has the lladdr copied in. */
	if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen);

	CURVNET_RESTORE();
}

/*
 * Attach the vlan described by ifv to parent interface p, using VLAN ID
 * vid and protocol value proto.
 *
 * Returns 0 on success, otherwise:
 *   EPROTONOSUPPORT  p is neither IFT_ETHER nor IFT_L2VLAN and does not
 *                    have IFCAP_VLAN_HWTAGGING enabled, or p lacks one of
 *                    the VLAN_IFFLAGS flags;
 *   EINVAL           vid is 0, 0xFFF, or has bits outside EVL_VLID_MASK;
 *   EBUSY            ifv is already attached to a trunk whose parent is
 *                    not p;
 *   otherwise        whatever vlan_inshash() returned.
 *
 * When ifv is already attached to p, only ifv_proto is updated and, if
 * the vid changed, the entry is re-hashed.  The function takes and drops
 * the VLAN exclusive lock itself; the argument checks above run before
 * the lock is taken.  On success the vlan_config event is invoked with
 * (p, ifv->ifv_vid) while the lock is still held.
 */
static int
vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid,
    uint16_t proto)
{
	struct epoch_tracker et;
	struct ifvlantrunk *trunk;
	struct ifnet *ifp;
	int error = 0;

	/*
	 * We can handle non-ethernet hardware types as long as
	 * they handle the tagging and headers themselves.
	 */
	if (p->if_type != IFT_ETHER &&
	    p->if_type != IFT_L2VLAN &&
	    (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0)
		return (EPROTONOSUPPORT);
	if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS)
		return (EPROTONOSUPPORT);
	/*
	 * Don't let the caller set up a VLAN VID with
	 * anything except VLID bits.
	 * VID numbers 0x0 and 0xFFF are reserved.
	 */
	if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK))
		return (EINVAL);
	if (ifv->ifv_trunk) {
		/* Already attached: only proto and vid may be changed. */
		trunk = ifv->ifv_trunk;
		if (trunk->parent != p)
			return (EBUSY);

		VLAN_XLOCK();

		ifv->ifv_proto = proto;

		if (ifv->ifv_vid != vid) {
			/* Re-hash */
			vlan_remhash(trunk, ifv);
			ifv->ifv_vid = vid;
			error = vlan_inshash(trunk, ifv);
		}
		/* Will unlock */
		goto done;
	}

	VLAN_XLOCK();
	if (p->if_vlantrunk == NULL) {
		/* First vlan on this parent: create the trunk. */
		trunk = malloc(sizeof(struct ifvlantrunk),
		    M_VLAN, M_WAITOK | M_ZERO);
		vlan_inithash(trunk);
		TRUNK_LOCK_INIT(trunk);
		TRUNK_WLOCK(trunk);
		p->if_vlantrunk = trunk;
		trunk->parent = p;
		if_ref(trunk->parent);
		TRUNK_WUNLOCK(trunk);
	} else {
		trunk = p->if_vlantrunk;
	}

	ifv->ifv_vid = vid;	/* must set this before vlan_inshash() */
	ifv->ifv_pcp = 0;	/* Default: best effort delivery. */
	/*
	 * NOTE(review): on failure the goto below leaves ifv_vid/ifv_pcp
	 * modified and a freshly created trunk attached to p; confirm that
	 * callers tolerate this.
	 */
	error = vlan_inshash(trunk, ifv);
	if (error)
		goto done;
	ifv->ifv_proto = proto;
	ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN;
	ifv->ifv_mintu = ETHERMIN;
	ifv->ifv_pflags = 0;
	ifv->ifv_capenable = -1;	/* all bits set: nothing masked yet */

	/*
	 * If the parent supports the VLAN_MTU capability,
	 * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames,
	 * use it.
	 */
	if (p->if_capenable & IFCAP_VLAN_MTU) {
		/*
		 * No need to fudge the MTU since the parent can
		 * handle extended frames.
		 */
		ifv->ifv_mtufudge = 0;
	} else {
		/*
		 * Fudge the MTU by the encapsulation size.  This
		 * makes us incompatible with strictly compliant
		 * 802.1Q implementations, but allows us to use
		 * the feature with other NetBSD implementations,
		 * which might still be useful.
		 */
		ifv->ifv_mtufudge = ifv->ifv_encaplen;
	}

	ifv->ifv_trunk = trunk;
	ifp = ifv->ifv_ifp;
	/*
	 * Initialize fields from our parent.  This duplicates some
	 * work with ether_ifattach() but allows for non-ethernet
	 * interfaces to also work.
	 */
	ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge;
	ifp->if_baudrate = p->if_baudrate;
	ifp->if_input = p->if_input;
	ifp->if_resolvemulti = p->if_resolvemulti;
	ifp->if_addrlen = p->if_addrlen;
	ifp->if_broadcastaddr = p->if_broadcastaddr;
	ifp->if_pcp = ifv->ifv_pcp;

	/*
	 * We wrap the parent's if_output using vlan_output to ensure that it
	 * can't become stale.
	 */
	ifp->if_output = vlan_output;

	/*
	 * Copy only a selected subset of flags from the parent.
	 * Other flags are none of our business.
	 */
#define VLAN_COPY_FLAGS (IFF_SIMPLEX)
	ifp->if_flags &= ~VLAN_COPY_FLAGS;
	ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS;
#undef VLAN_COPY_FLAGS

	ifp->if_link_state = p->if_link_state;

	/* vlan_capabilities() asserts that it runs inside the net epoch. */
	NET_EPOCH_ENTER(et);
	vlan_capabilities(ifv);
	NET_EPOCH_EXIT(et);

	/*
	 * Set up our interface address to reflect the underlying
	 * physical interface's.
	 */
	TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv);
	((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen =
	    p->if_addrlen;

	/*
	 * Do not schedule link address update if it was the same
	 * as previous parent's. This helps avoid updating for each
	 * associated llentry.
	 */
	if (memcmp(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen) != 0) {
		bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen);
		taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
	}

	/* We are ready for operation now. */
	ifp->if_drv_flags |= IFF_DRV_RUNNING;

	/* Update flags on the parent, if necessary. */
	vlan_setflags(ifp, 1);

	/*
	 * Configure multicast addresses that may already be
	 * joined on the vlan device.
	 */
	(void)vlan_setmulti(ifp);

done:
	/* Reached with the VLAN exclusive lock held on every path. */
	if (error == 0)
		EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid);
	VLAN_XUNLOCK();

	return (error);
}

/*
 * Detach ifp from its parent: takes the VLAN exclusive lock and calls
 * vlan_unconfig_locked() with departing == 0.
 */
static void
vlan_unconfig(struct ifnet *ifp)
{

	VLAN_XLOCK();
	vlan_unconfig_locked(ifp, 0);
	VLAN_XUNLOCK();
}

/*
 * Detach ifp from its trunk, if it has one.  The caller must hold the
 * VLAN exclusive lock (asserted).
 *
 * departing: when non-zero, if_delmulti() is not called on the parent
 * (see the comment inside the loop).
 *
 * The interface's MTU, link state and IFF_DRV_RUNNING flag are reset
 * whether or not a trunk was attached.  The vlan_unconfig event is
 * invoked only if there was a parent.
 */
static void
vlan_unconfig_locked(struct ifnet *ifp, int departing)
{
	struct ifvlantrunk *trunk;
	struct vlan_mc_entry *mc;
	struct ifvlan *ifv;
	struct ifnet *parent;
	int error;

	VLAN_XLOCK_ASSERT();

	ifv = ifp->if_softc;
	trunk = ifv->ifv_trunk;
	parent = NULL;

	if (trunk != NULL) {
		parent = trunk->parent;

		/*
		 * Since the interface is being unconfigured, we need to
		 * empty the list of multicast groups that we may have joined
		 * while we were alive from the parent's list.
		 */
		while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) {
			/*
			 * If the parent interface is being detached,
			 * all its multicast addresses have already
			 * been removed.  Warn about errors if
			 * if_delmulti() does fail, but don't abort as
			 * all callers expect vlan destruction to
			 * succeed.
			 */
			if (!departing) {
				error = if_delmulti(parent,
				    (struct sockaddr *)&mc->mc_addr);
				if (error)
					if_printf(ifp,
					    "Failed to delete multicast address from parent: %d\n",
					    error);
			}
			CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries);
			/* The entry is freed via vlan_mc_free in an epoch callback. */
			NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx);
		}

		vlan_setflags(ifp, 0); /* clear special flags on parent */

		vlan_remhash(trunk, ifv);
		ifv->ifv_trunk = NULL;

		/*
		 * Check if we were the last.
		 */
		if (trunk->refcnt == 0) {
			parent->if_vlantrunk = NULL;
			NET_EPOCH_WAIT();
			trunk_destroy(trunk);
		}
	}

	/* Disconnect from parent. */
	if (ifv->ifv_pflags)
		if_printf(ifp, "%s: ifv_pflags unclean\n", __func__);
	ifp->if_mtu = ETHERMTU;
	ifp->if_link_state = LINK_STATE_UNKNOWN;
	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;

	/*
	 * Only dispatch an event if vlan was
	 * attached, otherwise there is nothing
	 * to cleanup anyway.
	 */
	if (parent != NULL)
		EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid);
}

/*
 * Handle a reference counted flag that should be set on the parent as well.
 *
 * ifp:    the vlan interface.
 * flag:   the if_flags bit being propagated.
 * status: zero to force the flag off on the parent; non-zero to follow
 *         ifp's own setting of the flag.
 * func:   called as func(parent, new_value) when the recorded state in
 *         ifv_pflags differs from the wanted one.
 *
 * Returns 0, or the error returned by func (in which case ifv_pflags is
 * left unchanged).  Asserts that the VLAN sx lock is held.
 */
static int
vlan_setflag(struct ifnet *ifp, int flag, int status,
    int (*func)(struct ifnet *, int))
{
	struct ifvlan *ifv;
	int error;

	VLAN_SXLOCK_ASSERT();

	ifv = ifp->if_softc;
	status = status ? (ifp->if_flags & flag) : 0;
	/* Now "status" contains the flag value or 0 */

	/*
	 * See if recorded parent's status is different from what
	 * we want it to be.  If it is, flip it.  We record parent's
	 * status in ifv_pflags so that we won't clear parent's flag
	 * we haven't set.  In fact, we don't clear or set parent's
	 * flags directly, but get or release references to them.
	 * That's why we can be sure that recorded flags still are
	 * in accord with actual parent's flags.
	 */
	if (status != (ifv->ifv_pflags & flag)) {
		error = (*func)(PARENT(ifv), status);
		if (error)
			return (error);
		ifv->ifv_pflags &= ~flag;
		ifv->ifv_pflags |= status;
	}
	return (0);
}

/*
 * Handle IFF_* flags that require certain changes on the parent:
 * if "status" is true, update parent's flags respective to our if_flags;
 * if "status" is false, forcedly clear the flags set on parent.
*/
/*
 * vlan_setflags(): walks the vlan_pflags[] table up to the first entry
 * whose flag is zero and applies vlan_setflag() to each entry.  Returns 0,
 * or the first error from vlan_setflag(), at which point the walk stops.
 */
static int
vlan_setflags(struct ifnet *ifp, int status)
{
	int error, i;

	for (i = 0; vlan_pflags[i].flag; i++) {
		error = vlan_setflag(ifp, vlan_pflags[i].flag,
		    status, vlan_pflags[i].func);
		if (error)
			return (error);
	}
	return (0);
}

/*
 * Inform all vlans that their parent has changed link state.
 *
 * ifp is the parent.  If it has no trunk, nothing is done.  Otherwise,
 * inside the net epoch and under the trunk write lock, every vlan on the
 * trunk gets the parent's baudrate and a link-state change notification.
 */
static void
vlan_link_state(struct ifnet *ifp)
{
	struct epoch_tracker et;
	struct ifvlantrunk *trunk;
	struct ifvlan *ifv;

	NET_EPOCH_ENTER(et);
	trunk = ifp->if_vlantrunk;
	if (trunk == NULL) {
		NET_EPOCH_EXIT(et);
		return;
	}

	TRUNK_WLOCK(trunk);
	VLAN_FOREACH(ifv, trunk) {
		ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate;
		if_link_state_change(ifv->ifv_ifp,
		    trunk->parent->if_link_state);
	}
	TRUNK_WUNLOCK(trunk);
	NET_EPOCH_EXIT(et);
}

/*
 * Recompute the vlan interface's if_capabilities, if_capenable and
 * if_hwassist from its parent's.
 *
 * cap collects what the parent is capable of, ena what is currently
 * enabled, and hwa the matching if_hwassist bits.  mena is the parent's
 * enabled set masked by ifv_capenable.
 *
 * The caller must be inside the net epoch and hold the VLAN sx lock
 * (both asserted).
 */
static void
vlan_capabilities(struct ifvlan *ifv)
{
	struct ifnet *p;
	struct ifnet *ifp;
	struct ifnet_hw_tsomax hw_tsomax;
	int cap = 0, ena = 0, mena;
	u_long hwa = 0;

	NET_EPOCH_ASSERT();
	VLAN_SXLOCK_ASSERT();

	p = PARENT(ifv);
	ifp = ifv->ifv_ifp;

	/* Mask parent interface enabled capabilities disabled by user. */
	mena = p->if_capenable & ifv->ifv_capenable;

	/*
	 * If the parent interface can do checksum offloading
	 * on VLANs, then propagate its hardware-assisted
	 * checksumming flags. Also assert that checksum
	 * offloading requires hardware VLAN tagging.
	 */
	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
		cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
	if (p->if_capenable & IFCAP_VLAN_HWCSUM &&
	    p->if_capenable & IFCAP_VLAN_HWTAGGING) {
		ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
		if (ena & IFCAP_TXCSUM)
			hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP |
			    CSUM_UDP | CSUM_SCTP);
		if (ena & IFCAP_TXCSUM_IPV6)
			hwa |= p->if_hwassist & (CSUM_TCP_IPV6 |
			    CSUM_UDP_IPV6 | CSUM_SCTP_IPV6);
	}

	/*
	 * If the parent interface can do TSO on VLANs then
	 * propagate the hardware-assisted flag. TSO on VLANs
	 * does not necessarily require hardware VLAN tagging.
	 */
	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
	if_hw_tsomax_common(p, &hw_tsomax);
	if_hw_tsomax_update(ifp, &hw_tsomax);
	if (p->if_capabilities & IFCAP_VLAN_HWTSO)
		cap |= p->if_capabilities & IFCAP_TSO;
	if (p->if_capenable & IFCAP_VLAN_HWTSO) {
		ena |= mena & IFCAP_TSO;
		if (ena & IFCAP_TSO)
			hwa |= p->if_hwassist & CSUM_TSO;
	}

	/*
	 * If the parent interface can do LRO and checksum offloading on
	 * VLANs, then guess it may do LRO on VLANs.  False positive here
	 * cost nothing, while false negative may lead to some confusions.
	 *
	 * NOTE(review): this is the only capability whose enable bit is
	 * taken from p->if_capenable rather than from mena, so it ignores
	 * ifv_capenable; confirm whether that is intended.
	 */
	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
		cap |= p->if_capabilities & IFCAP_LRO;
	if (p->if_capenable & IFCAP_VLAN_HWCSUM)
		ena |= p->if_capenable & IFCAP_LRO;

	/*
	 * If the parent interface can offload TCP connections over VLANs then
	 * propagate its TOE capability to the VLAN interface.
	 *
	 * All TOE drivers in the tree today can deal with VLANs.  If this
	 * changes then IFCAP_VLAN_TOE should be promoted to a full capability
	 * with its own bit.
	 */
#define	IFCAP_VLAN_TOE IFCAP_TOE
	if (p->if_capabilities & IFCAP_VLAN_TOE)
		cap |= p->if_capabilities & IFCAP_TOE;
	if (p->if_capenable & IFCAP_VLAN_TOE) {
		TOEDEV(ifp) = TOEDEV(p);
		ena |= mena & IFCAP_TOE;
	}

	/*
	 * If the parent interface supports dynamic link state, so does the
	 * VLAN interface.
	 */
	cap |= (p->if_capabilities & IFCAP_LINKSTATE);
	ena |= (mena & IFCAP_LINKSTATE);

#ifdef RATELIMIT
	/*
	 * If the parent interface supports ratelimiting, so does the
	 * VLAN interface.
	 */
	cap |= (p->if_capabilities & IFCAP_TXRTLMT);
	ena |= (mena & IFCAP_TXRTLMT);
#endif

	/*
	 * If the parent interface supports unmapped mbufs, so does
	 * the VLAN interface.  Note that this should be fine even for
	 * interfaces that don't support hardware tagging as headers
	 * are prepended in normal mbufs to unmapped mbufs holding
	 * payload data.
	 */
	cap |= (p->if_capabilities & IFCAP_MEXTPG);
	ena |= (mena & IFCAP_MEXTPG);

	/*
	 * If the parent interface can offload encryption and segmentation
	 * of TLS records over TCP, propagate its capability to the VLAN
	 * interface.
	 *
	 * All TLS drivers in the tree today can deal with VLANs.  If
	 * this ever changes, then a new IFCAP_VLAN_TXTLS can be
	 * defined.
	 */
	if (p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT))
		cap |= p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT);
	if (p->if_capenable & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT))
		ena |= mena & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT);

	/* Publish the computed sets on the vlan interface. */
	ifp->if_capabilities = cap;
	ifp->if_capenable = ena;
	ifp->if_hwassist = hwa;
}

/*
 * Recompute capabilities for every vlan attached to parent ifp by calling
 * vlan_capabilities() on each, under the VLAN shared lock and inside the
 * net epoch.  Does nothing if ifp has no trunk.
 */
static void
vlan_trunk_capabilities(struct ifnet *ifp)
{
	struct epoch_tracker et;
	struct ifvlantrunk *trunk;
	struct ifvlan *ifv;

	VLAN_SLOCK();
	trunk = ifp->if_vlantrunk;
	if (trunk == NULL) {
		VLAN_SUNLOCK();
		return;
	}
	NET_EPOCH_ENTER(et);
	VLAN_FOREACH(ifv, trunk)
		vlan_capabilities(ifv);
	NET_EPOCH_EXIT(et);
	VLAN_SUNLOCK();
}

/*
 * ioctl handler for a vlan interface.
 *
 * data is interpreted as a struct ifreq for most commands, as a
 * struct ifaddr for SIOCSIFADDR (INET only), and as a struct ifmediareq
 * after a successful SIOCGIFMEDIA call into the parent.
 *
 * Returns 0 or an errno value.  Unknown commands and SIOCSIFMEDIA return
 * EINVAL.  Under VIMAGE, SIOCSETVLAN, SIOCGETVLAN, SIOCGVLANPCP and
 * SIOCSVLANPCP return EPERM unless the interface is in its home vnet.
 */
static int
vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct ifnet *p;
	struct ifreq *ifr;
#ifdef INET
	struct ifaddr *ifa;
#endif
	struct ifvlan *ifv;
	struct ifvlantrunk *trunk;
	struct vlanreq vlr;
	int error = 0, oldmtu;

	ifr = (struct ifreq *)data;
#ifdef INET
	ifa = (struct ifaddr *) data;
#endif
	ifv = ifp->if_softc;

	switch (cmd) {
	case SIOCSIFADDR:
		ifp->if_flags |= IFF_UP;
#ifdef INET
		if (ifa->ifa_addr->sa_family == AF_INET)
			arp_ifinit(ifp, ifa);
#endif
		break;
	case SIOCGIFADDR:
		bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
		    ifp->if_addrlen);
		break;
	case SIOCGIFMEDIA:
		VLAN_SLOCK();
		if (TRUNK(ifv) != NULL) {
			p = PARENT(ifv);
			/* Hold a reference on the parent across its ioctl. */
			if_ref(p);
			error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data);
			if_rele(p);
			/* Limit the result to the parent's current config. */
			if (error == 0) {
				struct ifmediareq *ifmr;

				ifmr = (struct ifmediareq *)data;
				if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) {
					ifmr->ifm_count = 1;
					error = copyout(&ifmr->ifm_current,
					    ifmr->ifm_ulist,
					    sizeof(int));
				}
			}
		} else {
			error = EINVAL;
		}
		VLAN_SUNLOCK();
		break;

	case SIOCSIFMEDIA:
		error = EINVAL;
		break;

	case SIOCSIFMTU:
		/*
		 * Set the interface MTU.
		 */
		VLAN_SLOCK();
		trunk = TRUNK(ifv);
		if (trunk != NULL) {
			TRUNK_WLOCK(trunk);
			/* Bounds are the parent MTU and ifv_mintu, each less ifv_mtufudge. */
			if (ifr->ifr_mtu >
			     (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) ||
			    ifr->ifr_mtu <
			     (ifv->ifv_mintu - ifv->ifv_mtufudge))
				error = EINVAL;
			else
				ifp->if_mtu = ifr->ifr_mtu;
			TRUNK_WUNLOCK(trunk);
		} else
			error = EINVAL;
		VLAN_SUNLOCK();
		break;

	case SIOCSETVLAN:
#ifdef VIMAGE
		/*
		 * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN
		 * interface to be delegated to a jail without allowing the
		 * jail to change what underlying interface/VID it is
		 * associated with.  We are not entirely convinced that this
		 * is the right way to accomplish that policy goal.
		 */
		if (ifp->if_vnet != ifp->if_home_vnet) {
			error = EPERM;
			break;
		}
#endif
		error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr));
		if (error)
			break;
		/* An empty parent name requests detaching from the parent. */
		if (vlr.vlr_parent[0] == '\0') {
			vlan_unconfig(ifp);
			break;
		}
		p = ifunit_ref(vlr.vlr_parent);
		if (p == NULL) {
			error = ENOENT;
			break;
		}
#ifdef COMPAT_FREEBSD12
		if (vlr.vlr_proto == 0)
			vlr.vlr_proto = ETHERTYPE_VLAN;
#endif
		oldmtu = ifp->if_mtu;
		error = vlan_config(ifv, p, vlr.vlr_tag, vlr.vlr_proto);
		if_rele(p);

		/*
		 * VLAN MTU may change during addition of the vlandev.
		 * If it did, do network layer specific procedure.
		 */
		if (ifp->if_mtu != oldmtu) {
#ifdef INET6
			nd6_setmtu(ifp);
#endif
			rt_updatemtu(ifp);
		}
		break;

	case SIOCGETVLAN:
#ifdef VIMAGE
		if (ifp->if_vnet != ifp->if_home_vnet) {
			error = EPERM;
			break;
		}
#endif
		/* With no trunk attached, the zeroed vlr is copied out. */
		bzero(&vlr, sizeof(vlr));
		VLAN_SLOCK();
		if (TRUNK(ifv) != NULL) {
			strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname,
			    sizeof(vlr.vlr_parent));
			vlr.vlr_tag = ifv->ifv_vid;
			vlr.vlr_proto = ifv->ifv_proto;
		}
		VLAN_SUNLOCK();
		error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr));
		break;

	case SIOCSIFFLAGS:
		/*
		 * We should propagate selected flags to the parent,
		 * e.g., promiscuous mode.
		 */
		VLAN_XLOCK();
		if (TRUNK(ifv) != NULL)
			error = vlan_setflags(ifp, 1);
		VLAN_XUNLOCK();
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		/*
		 * If we don't have a parent, just remember the membership for
		 * when we do.
		 *
		 * XXX We need the rmlock here to avoid sleeping while
		 * holding in6_multi_mtx.
		 */
		VLAN_XLOCK();
		trunk = TRUNK(ifv);
		if (trunk != NULL)
			error = vlan_setmulti(ifp);
		VLAN_XUNLOCK();

		break;
	case SIOCGVLANPCP:
#ifdef VIMAGE
		if (ifp->if_vnet != ifp->if_home_vnet) {
			error = EPERM;
			break;
		}
#endif
		ifr->ifr_vlan_pcp = ifv->ifv_pcp;
		break;

	case SIOCSVLANPCP:
#ifdef VIMAGE
		if (ifp->if_vnet != ifp->if_home_vnet) {
			error = EPERM;
			break;
		}
#endif
		error = priv_check(curthread, PRIV_NET_SETVLANPCP);
		if (error)
			break;
		if (ifr->ifr_vlan_pcp > VLAN_PCP_MAX) {
			error = EINVAL;
			break;
		}
		ifv->ifv_pcp = ifr->ifr_vlan_pcp;
		ifp->if_pcp = ifv->ifv_pcp;
		/* broadcast event about PCP change */
		EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP);
		break;

	case SIOCSIFCAP:
		VLAN_SLOCK();
		/* Record the requested mask; it is applied only if a trunk is attached. */
		ifv->ifv_capenable = ifr->ifr_reqcap;
		trunk = TRUNK(ifv);
		if (trunk != NULL) {
			struct epoch_tracker et;

			NET_EPOCH_ENTER(et);
			vlan_capabilities(ifv);
			NET_EPOCH_EXIT(et);
		}
		VLAN_SUNLOCK();
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}

#if defined(KERN_TLS) || defined(RATELIMIT)
/*
 * Allocate a send tag for the vlan interface ifp by allocating one on its
 * parent with m_snd_tag_alloc().
 *
 * For the types that select a vlan switch table (sw != NULL) the parent's
 * tag is wrapped in a struct vlan_snd_tag and *ppmt points at the wrapper.
 * For IF_SND_TAG_TYPE_TLS_RX (sw == NULL) the vlan ID is filled into
 * params and the parent's tag is returned directly.
 *
 * Returns 0 on success, otherwise:
 *   EOPNOTSUPP  unsupported tag type, a TLS_RX request whose vlan_id is
 *               already non-zero, or no trunk attached;
 *   ENOMEM      the wrapper could not be allocated;
 *   otherwise   the error from m_snd_tag_alloc().
 */
static int
vlan_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct epoch_tracker et;
	const struct if_snd_tag_sw *sw;
	struct vlan_snd_tag *vst;
	struct ifvlan *ifv;
	struct ifnet *parent;
	struct m_snd_tag *mst;
	int error;

	NET_EPOCH_ENTER(et);
	ifv = ifp->if_softc;

	switch (params->hdr.type) {
#ifdef RATELIMIT
	case IF_SND_TAG_TYPE_UNLIMITED:
		sw = &vlan_snd_tag_ul_sw;
		break;
	case IF_SND_TAG_TYPE_RATE_LIMIT:
		sw = &vlan_snd_tag_rl_sw;
		break;
#endif
#ifdef KERN_TLS
	case IF_SND_TAG_TYPE_TLS:
		sw = &vlan_snd_tag_tls_sw;
		break;
	case IF_SND_TAG_TYPE_TLS_RX:
		sw = NULL;
		if (params->tls_rx.vlan_id != 0)
			goto failure;
		params->tls_rx.vlan_id = ifv->ifv_vid;
		break;
#ifdef RATELIMIT
	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
		sw = &vlan_snd_tag_tls_rl_sw;
		break;
#endif
#endif
	default:
		goto failure;
	}

	if (ifv->ifv_trunk != NULL)
		parent = PARENT(ifv);
	else
		parent = NULL;
	if (parent == NULL)
		goto failure;
	/* Take a reference on the parent before leaving the epoch. */
	if_ref(parent);
	NET_EPOCH_EXIT(et);

	if (sw != NULL) {
		vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT);
		if (vst == NULL) {
			if_rele(parent);
			return (ENOMEM);
		}
	} else
		vst = NULL;

	error = m_snd_tag_alloc(parent, params, &mst);
	if_rele(parent);
	if (error) {
		/* vst is NULL here when sw == NULL. */
		free(vst, M_VLAN);
		return (error);
	}

	if (sw != NULL) {
		m_snd_tag_init(&vst->com, ifp, sw);
		vst->tag = mst;

		*ppmt = &vst->com;
	} else
		*ppmt = mst;

	return (0);
failure:
	NET_EPOCH_EXIT(et);
	return (EOPNOTSUPP);
}

/* Return the parent's send tag stored in the vlan tag that wraps it. */
static struct m_snd_tag *
vlan_next_snd_tag(struct m_snd_tag *mst)
{
	struct vlan_snd_tag *vst;

	vst = mst_to_vst(mst);
	return (vst->tag);
}

/* Forward a modify request to the wrapped tag's snd_tag_modify method. */
static int
vlan_snd_tag_modify(struct m_snd_tag *mst,
    union if_snd_tag_modify_params *params)
{
	struct vlan_snd_tag *vst;

	vst = mst_to_vst(mst);
	return (vst->tag->sw->snd_tag_modify(vst->tag, params));
}

/* Forward a query request to the wrapped tag's snd_tag_query method. */
static int
vlan_snd_tag_query(struct m_snd_tag *mst,
    union if_snd_tag_query_params *params)
{
	struct vlan_snd_tag *vst;

	vst = mst_to_vst(mst);
	return (vst->tag->sw->snd_tag_query(vst->tag, params));
}

/* Release the reference on the wrapped tag, then free the vlan wrapper. */
static void
vlan_snd_tag_free(struct m_snd_tag *mst)
{
	struct vlan_snd_tag *vst;

	vst = mst_to_vst(mst);
	m_snd_tag_rele(vst->tag);
	free(vst, M_VLAN);
}

/*
 * Fill q with an empty rate table, zero flows and rates, and the
 * RT_IS_INDIRECT flag.  ifp is unused.
 */
static void
vlan_ratelimit_query(struct ifnet *ifp __unused,
    struct if_ratelimit_query_results *q)
{
	/*
	 * For vlan, we have an indirect
	 * interface. The caller needs to
	 * get a ratelimit tag on the actual
	 * interface the flow will go on.
	 */
	q->rate_table = NULL;
	q->flags = RT_IS_INDIRECT;
	q->max_flows = 0;
	q->number_of_rates = 0;
}
#endif
diff --git a/sys/net/if_vxlan.c b/sys/net/if_vxlan.c
index a1e925195d74..6c9a3d7e3095 100644
--- a/sys/net/if_vxlan.c
+++ b/sys/net/if_vxlan.c
@@ -1,3686 +1,3696 @@
/*-
 * Copyright (c) 2014, Bryan Venteicher
 * All rights reserved.
 * Copyright (c) 2020, Chelsio Communications.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice unmodified, this list of conditions, and the following
 * disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vxlan_softc; LIST_HEAD(vxlan_softc_head, vxlan_softc); struct sx vxlan_sx; SX_SYSINIT(vxlan, &vxlan_sx, "VXLAN global start/stop lock"); struct vxlan_socket_mc_info { union vxlan_sockaddr vxlsomc_saddr; union vxlan_sockaddr vxlsomc_gaddr; int vxlsomc_ifidx; int vxlsomc_users; }; /* * The maximum MTU of encapsulated ethernet frame within IPv4/UDP packet. */ #define VXLAN_MAX_MTU (IP_MAXPACKET - \ 60 /* Maximum IPv4 header len */ - \ sizeof(struct udphdr) - \ sizeof(struct vxlan_header) - \ ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN) #define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU) #define VXLAN_SO_MC_MAX_GROUPS 32 #define VXLAN_SO_VNI_HASH_SHIFT 6 #define VXLAN_SO_VNI_HASH_SIZE (1 << VXLAN_SO_VNI_HASH_SHIFT) #define VXLAN_SO_VNI_HASH(_vni) ((_vni) % VXLAN_SO_VNI_HASH_SIZE) struct vxlan_socket { struct socket *vxlso_sock; struct rmlock vxlso_lock; u_int vxlso_refcnt; union vxlan_sockaddr vxlso_laddr; LIST_ENTRY(vxlan_socket) vxlso_entry; struct vxlan_softc_head vxlso_vni_hash[VXLAN_SO_VNI_HASH_SIZE]; struct vxlan_socket_mc_info vxlso_mc[VXLAN_SO_MC_MAX_GROUPS]; }; #define VXLAN_SO_RLOCK(_vso, _p) rm_rlock(&(_vso)->vxlso_lock, (_p)) #define VXLAN_SO_RUNLOCK(_vso, _p) rm_runlock(&(_vso)->vxlso_lock, (_p)) #define VXLAN_SO_WLOCK(_vso) rm_wlock(&(_vso)->vxlso_lock) #define VXLAN_SO_WUNLOCK(_vso) rm_wunlock(&(_vso)->vxlso_lock) #define VXLAN_SO_LOCK_ASSERT(_vso) \ rm_assert(&(_vso)->vxlso_lock, RA_LOCKED) #define VXLAN_SO_LOCK_WASSERT(_vso) \ rm_assert(&(_vso)->vxlso_lock, 
RA_WLOCKED) #define VXLAN_SO_ACQUIRE(_vso) refcount_acquire(&(_vso)->vxlso_refcnt) #define VXLAN_SO_RELEASE(_vso) refcount_release(&(_vso)->vxlso_refcnt) struct vxlan_ftable_entry { LIST_ENTRY(vxlan_ftable_entry) vxlfe_hash; uint16_t vxlfe_flags; uint8_t vxlfe_mac[ETHER_ADDR_LEN]; union vxlan_sockaddr vxlfe_raddr; time_t vxlfe_expire; }; #define VXLAN_FE_FLAG_DYNAMIC 0x01 #define VXLAN_FE_FLAG_STATIC 0x02 #define VXLAN_FE_IS_DYNAMIC(_fe) \ ((_fe)->vxlfe_flags & VXLAN_FE_FLAG_DYNAMIC) #define VXLAN_SC_FTABLE_SHIFT 9 #define VXLAN_SC_FTABLE_SIZE (1 << VXLAN_SC_FTABLE_SHIFT) #define VXLAN_SC_FTABLE_MASK (VXLAN_SC_FTABLE_SIZE - 1) #define VXLAN_SC_FTABLE_HASH(_sc, _mac) \ (vxlan_mac_hash(_sc, _mac) % VXLAN_SC_FTABLE_SIZE) LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry); struct vxlan_statistics { uint32_t ftable_nospace; uint32_t ftable_lock_upgrade_failed; counter_u64_t txcsum; counter_u64_t tso; counter_u64_t rxcsum; }; struct vxlan_softc { struct ifnet *vxl_ifp; int vxl_reqcap; u_int vxl_fibnum; struct vxlan_socket *vxl_sock; uint32_t vxl_vni; union vxlan_sockaddr vxl_src_addr; union vxlan_sockaddr vxl_dst_addr; uint32_t vxl_flags; #define VXLAN_FLAG_INIT 0x0001 #define VXLAN_FLAG_TEARDOWN 0x0002 #define VXLAN_FLAG_LEARN 0x0004 #define VXLAN_FLAG_USER_MTU 0x0008 uint32_t vxl_port_hash_key; uint16_t vxl_min_port; uint16_t vxl_max_port; uint8_t vxl_ttl; /* Lookup table from MAC address to forwarding entry. */ uint32_t vxl_ftable_cnt; uint32_t vxl_ftable_max; uint32_t vxl_ftable_timeout; uint32_t vxl_ftable_hash_key; struct vxlan_ftable_head *vxl_ftable; /* Derived from vxl_dst_addr. 
*/ struct vxlan_ftable_entry vxl_default_fe; struct ip_moptions *vxl_im4o; struct ip6_moptions *vxl_im6o; struct rmlock vxl_lock; volatile u_int vxl_refcnt; int vxl_unit; int vxl_vso_mc_index; struct vxlan_statistics vxl_stats; struct sysctl_oid *vxl_sysctl_node; struct sysctl_ctx_list vxl_sysctl_ctx; struct callout vxl_callout; struct ether_addr vxl_hwaddr; int vxl_mc_ifindex; struct ifnet *vxl_mc_ifp; struct ifmedia vxl_media; char vxl_mc_ifname[IFNAMSIZ]; LIST_ENTRY(vxlan_softc) vxl_entry; LIST_ENTRY(vxlan_softc) vxl_ifdetach_list; /* For rate limiting errors on the tx fast path. */ struct timeval err_time; int err_pps; }; #define VXLAN_RLOCK(_sc, _p) rm_rlock(&(_sc)->vxl_lock, (_p)) #define VXLAN_RUNLOCK(_sc, _p) rm_runlock(&(_sc)->vxl_lock, (_p)) #define VXLAN_WLOCK(_sc) rm_wlock(&(_sc)->vxl_lock) #define VXLAN_WUNLOCK(_sc) rm_wunlock(&(_sc)->vxl_lock) #define VXLAN_LOCK_WOWNED(_sc) rm_wowned(&(_sc)->vxl_lock) #define VXLAN_LOCK_ASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_LOCKED) #define VXLAN_LOCK_WASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_WLOCKED) #define VXLAN_UNLOCK(_sc, _p) do { \ if (VXLAN_LOCK_WOWNED(_sc)) \ VXLAN_WUNLOCK(_sc); \ else \ VXLAN_RUNLOCK(_sc, _p); \ } while (0) #define VXLAN_ACQUIRE(_sc) refcount_acquire(&(_sc)->vxl_refcnt) #define VXLAN_RELEASE(_sc) refcount_release(&(_sc)->vxl_refcnt) #define satoconstsin(sa) ((const struct sockaddr_in *)(sa)) #define satoconstsin6(sa) ((const struct sockaddr_in6 *)(sa)) struct vxlanudphdr { struct udphdr vxlh_udp; struct vxlan_header vxlh_hdr; } __packed; static int vxlan_ftable_addr_cmp(const uint8_t *, const uint8_t *); static void vxlan_ftable_init(struct vxlan_softc *); static void vxlan_ftable_fini(struct vxlan_softc *); static void vxlan_ftable_flush(struct vxlan_softc *, int); static void vxlan_ftable_expire(struct vxlan_softc *); static int vxlan_ftable_update_locked(struct vxlan_softc *, const union vxlan_sockaddr *, const uint8_t *, struct rm_priotracker *); static int vxlan_ftable_learn(struct 
vxlan_softc *, const struct sockaddr *, const uint8_t *); static int vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS); static struct vxlan_ftable_entry * vxlan_ftable_entry_alloc(void); static void vxlan_ftable_entry_free(struct vxlan_ftable_entry *); static void vxlan_ftable_entry_init(struct vxlan_softc *, struct vxlan_ftable_entry *, const uint8_t *, const struct sockaddr *, uint32_t); static void vxlan_ftable_entry_destroy(struct vxlan_softc *, struct vxlan_ftable_entry *); static int vxlan_ftable_entry_insert(struct vxlan_softc *, struct vxlan_ftable_entry *); static struct vxlan_ftable_entry * vxlan_ftable_entry_lookup(struct vxlan_softc *, const uint8_t *); static void vxlan_ftable_entry_dump(struct vxlan_ftable_entry *, struct sbuf *); static struct vxlan_socket * vxlan_socket_alloc(const union vxlan_sockaddr *); static void vxlan_socket_destroy(struct vxlan_socket *); static void vxlan_socket_release(struct vxlan_socket *); static struct vxlan_socket * vxlan_socket_lookup(union vxlan_sockaddr *vxlsa); static void vxlan_socket_insert(struct vxlan_socket *); static int vxlan_socket_init(struct vxlan_socket *, struct ifnet *); static int vxlan_socket_bind(struct vxlan_socket *, struct ifnet *); static int vxlan_socket_create(struct ifnet *, int, const union vxlan_sockaddr *, struct vxlan_socket **); static void vxlan_socket_ifdetach(struct vxlan_socket *, struct ifnet *, struct vxlan_softc_head *); static struct vxlan_socket * vxlan_socket_mc_lookup(const union vxlan_sockaddr *); static int vxlan_sockaddr_mc_info_match( const struct vxlan_socket_mc_info *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int); static int vxlan_socket_mc_join_group(struct vxlan_socket *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int *, union vxlan_sockaddr *); static int vxlan_socket_mc_leave_group(struct vxlan_socket *, const union vxlan_sockaddr *, const union vxlan_sockaddr *, int); static int vxlan_socket_mc_add_group(struct vxlan_socket *, const 
union vxlan_sockaddr *, const union vxlan_sockaddr *, int, int *); static void vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *, int); static struct vxlan_softc * vxlan_socket_lookup_softc_locked(struct vxlan_socket *, uint32_t); static struct vxlan_softc * vxlan_socket_lookup_softc(struct vxlan_socket *, uint32_t); static int vxlan_socket_insert_softc(struct vxlan_socket *, struct vxlan_softc *); static void vxlan_socket_remove_softc(struct vxlan_socket *, struct vxlan_softc *); static struct ifnet * vxlan_multicast_if_ref(struct vxlan_softc *, int); static void vxlan_free_multicast(struct vxlan_softc *); static int vxlan_setup_multicast_interface(struct vxlan_softc *); static int vxlan_setup_multicast(struct vxlan_softc *); static int vxlan_setup_socket(struct vxlan_softc *); #ifdef INET6 static void vxlan_setup_zero_checksum_port(struct vxlan_softc *); #endif static void vxlan_setup_interface_hdrlen(struct vxlan_softc *); static int vxlan_valid_init_config(struct vxlan_softc *); static void vxlan_init_wait(struct vxlan_softc *); static void vxlan_init_complete(struct vxlan_softc *); static void vxlan_init(void *); static void vxlan_release(struct vxlan_softc *); static void vxlan_teardown_wait(struct vxlan_softc *); static void vxlan_teardown_complete(struct vxlan_softc *); static void vxlan_teardown_locked(struct vxlan_softc *); static void vxlan_teardown(struct vxlan_softc *); static void vxlan_ifdetach(struct vxlan_softc *, struct ifnet *, struct vxlan_softc_head *); static void vxlan_timer(void *); static int vxlan_ctrl_get_config(struct vxlan_softc *, void *); static int vxlan_ctrl_set_vni(struct vxlan_softc *, void *); static int vxlan_ctrl_set_local_addr(struct vxlan_softc *, void *); static int vxlan_ctrl_set_remote_addr(struct vxlan_softc *, void *); static int vxlan_ctrl_set_local_port(struct vxlan_softc *, void *); static int vxlan_ctrl_set_remote_port(struct vxlan_softc *, void *); static int vxlan_ctrl_set_port_range(struct vxlan_softc *, 
void *); static int vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *, void *); static int vxlan_ctrl_set_ftable_max(struct vxlan_softc *, void *); static int vxlan_ctrl_set_multicast_if(struct vxlan_softc * , void *); static int vxlan_ctrl_set_ttl(struct vxlan_softc *, void *); static int vxlan_ctrl_set_learn(struct vxlan_softc *, void *); static int vxlan_ctrl_ftable_entry_add(struct vxlan_softc *, void *); static int vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *, void *); static int vxlan_ctrl_flush(struct vxlan_softc *, void *); static int vxlan_ioctl_drvspec(struct vxlan_softc *, struct ifdrv *, int); static int vxlan_ioctl_ifflags(struct vxlan_softc *); static int vxlan_ioctl(struct ifnet *, u_long, caddr_t); #if defined(INET) || defined(INET6) static uint16_t vxlan_pick_source_port(struct vxlan_softc *, struct mbuf *); static void vxlan_encap_header(struct vxlan_softc *, struct mbuf *, int, uint16_t, uint16_t); #endif static int vxlan_encap4(struct vxlan_softc *, const union vxlan_sockaddr *, struct mbuf *); static int vxlan_encap6(struct vxlan_softc *, const union vxlan_sockaddr *, struct mbuf *); static int vxlan_transmit(struct ifnet *, struct mbuf *); static void vxlan_qflush(struct ifnet *); static bool vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); static int vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **, const struct sockaddr *); static int vxlan_stats_alloc(struct vxlan_softc *); static void vxlan_stats_free(struct vxlan_softc *); static void vxlan_set_default_config(struct vxlan_softc *); static int vxlan_set_user_config(struct vxlan_softc *, struct ifvxlanparam *); static int vxlan_set_reqcap(struct vxlan_softc *, struct ifnet *, int); static void vxlan_set_hwcaps(struct vxlan_softc *); -static int vxlan_clone_create(struct if_clone *, int, caddr_t); -static void vxlan_clone_destroy(struct ifnet *); +static int vxlan_clone_create(struct if_clone *, char *, size_t, + struct ifc_data *, 
struct ifnet **);
+static int	vxlan_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);

/* Miscellaneous helpers: MAC hashing and ifmedia callbacks. */
static uint32_t	vxlan_mac_hash(struct vxlan_softc *, const uint8_t *);
static int	vxlan_media_change(struct ifnet *);
static void	vxlan_media_status(struct ifnet *, struct ifmediareq *);

/* Socket address helpers operating on union vxlan_sockaddr. */
static int	vxlan_sockaddr_cmp(const union vxlan_sockaddr *,
		    const struct sockaddr *);
static void	vxlan_sockaddr_copy(union vxlan_sockaddr *,
		    const struct sockaddr *);
static int	vxlan_sockaddr_in_equal(const union vxlan_sockaddr *,
		    const struct sockaddr *);
static void	vxlan_sockaddr_in_copy(union vxlan_sockaddr *,
		    const struct sockaddr *);
static int	vxlan_sockaddr_supported(const union vxlan_sockaddr *, int);
static int	vxlan_sockaddr_in_any(const union vxlan_sockaddr *);
static int	vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *);
static int	vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *);

/* Configuration validation. */
static int	vxlan_can_change_config(struct vxlan_softc *);
static int	vxlan_check_vni(uint32_t);
static int	vxlan_check_ttl(int);
static int	vxlan_check_ftable_timeout(uint32_t);
static int	vxlan_check_ftable_max(uint32_t);

/* Sysctl/tunable plumbing and module life cycle. */
static void	vxlan_sysctl_setup(struct vxlan_softc *);
static void	vxlan_sysctl_destroy(struct vxlan_softc *);
static int	vxlan_tunable_int(struct vxlan_softc *, const char *, int);

static void	vxlan_ifdetach_event(void *, struct ifnet *);
static void	vxlan_load(void);
static void	vxlan_unload(void);
static int	vxlan_modevent(module_t, int, void *);

/* Driver name; also used as the malloc type's short description. */
static const char vxlan_name[] = "vxlan";
static MALLOC_DEFINE(M_VXLAN, vxlan_name,
    "Virtual eXtensible LAN Interface");
static struct if_clone *vxlan_cloner;

/* vxlan_list_mtx is the mutex taken by VXLAN_LIST_LOCK()/UNLOCK(). */
static struct mtx vxlan_list_mtx;
#define VXLAN_LIST_LOCK()	mtx_lock(&vxlan_list_mtx)
#define VXLAN_LIST_UNLOCK()	mtx_unlock(&vxlan_list_mtx)

/* Global list of vxlan sockets; walked and modified under the list lock. */
static LIST_HEAD(, vxlan_socket) vxlan_socket_list;

static eventhandler_tag vxlan_ifdetach_event_tag;

SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, OID_AUTO, vxlan, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Virtual eXtensible Local
Area Network");

/* Loader tunable net.link.vxlan.legacy_port; defaults to 0. */
static int vxlan_legacy_port = 0;
TUNABLE_INT("net.link.vxlan.legacy_port", &vxlan_legacy_port);
/*
 * Loader tunable net.link.vxlan.reuse_port; defaults to 0.  When nonzero,
 * vxlan_socket_init() sets SO_REUSEPORT on newly created sockets.
 */
static int vxlan_reuse_port = 0;
TUNABLE_INT("net.link.vxlan.reuse_port", &vxlan_reuse_port);

/* Default maximum number of addresses in the forwarding table. */
#ifndef VXLAN_FTABLE_MAX
#define VXLAN_FTABLE_MAX	2000
#endif

/* Timeout (in seconds) of addresses learned in the forwarding table. */
#ifndef VXLAN_FTABLE_TIMEOUT
#define VXLAN_FTABLE_TIMEOUT	(20 * 60)
#endif

/*
 * Maximum timeout (in seconds) of addresses learned in the forwarding
 * table.
 */
#ifndef VXLAN_FTABLE_MAX_TIMEOUT
#define VXLAN_FTABLE_MAX_TIMEOUT	(60 * 60 * 24)
#endif

/* Number of seconds between pruning attempts of the forwarding table. */
#ifndef VXLAN_FTABLE_PRUNE
#define VXLAN_FTABLE_PRUNE	(5 * 60)
#endif

/* Period, in seconds, used when (re)arming the softc callout. */
static int vxlan_ftable_prune_period = VXLAN_FTABLE_PRUNE;

/*
 * One entry of the driver-specific command table: the handler to call, the
 * size of its argument structure, and VXLAN_CTRL_FLAG_* bits.
 * NOTE(review): the flags presumably tell vxlan_ioctl_drvspec() whether to
 * copy the argument in/out and whether privilege is required -- confirm
 * against that function, which is not part of this excerpt.
 */
struct vxlan_control {
	int	(*vxlc_func)(struct vxlan_softc *, void *);
	int	vxlc_argsize;
	int	vxlc_flags;
#define VXLAN_CTRL_FLAG_COPYIN	0x01
#define VXLAN_CTRL_FLAG_COPYOUT	0x02
#define VXLAN_CTRL_FLAG_SUSER	0x04
};

/* Command table indexed by VXLAN_CMD_* value. */
static const struct vxlan_control vxlan_control_table[] = {
	[VXLAN_CMD_GET_CONFIG] =
	    {	vxlan_ctrl_get_config, sizeof(struct ifvxlancfg),
		VXLAN_CTRL_FLAG_COPYOUT
	    },

	[VXLAN_CMD_SET_VNI] =
	    {   vxlan_ctrl_set_vni, sizeof(struct ifvxlancmd),
		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
	    },

	[VXLAN_CMD_SET_LOCAL_ADDR] =
	    {   vxlan_ctrl_set_local_addr, sizeof(struct ifvxlancmd),
		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
	    },

	[VXLAN_CMD_SET_REMOTE_ADDR] =
	    {   vxlan_ctrl_set_remote_addr, sizeof(struct ifvxlancmd),
		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
	    },

	[VXLAN_CMD_SET_LOCAL_PORT] =
	    {   vxlan_ctrl_set_local_port, sizeof(struct ifvxlancmd),
		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
	    },

	[VXLAN_CMD_SET_REMOTE_PORT] =
	    {   vxlan_ctrl_set_remote_port, sizeof(struct ifvxlancmd),
		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
	    },

	[VXLAN_CMD_SET_PORT_RANGE] =
	    {   vxlan_ctrl_set_port_range, sizeof(struct ifvxlancmd),
VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_FTABLE_TIMEOUT] = { vxlan_ctrl_set_ftable_timeout, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_FTABLE_MAX] = { vxlan_ctrl_set_ftable_max, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_MULTICAST_IF] = { vxlan_ctrl_set_multicast_if, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_TTL] = { vxlan_ctrl_set_ttl, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_SET_LEARN] = { vxlan_ctrl_set_learn, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FTABLE_ENTRY_ADD] = { vxlan_ctrl_ftable_entry_add, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FTABLE_ENTRY_REM] = { vxlan_ctrl_ftable_entry_rem, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, [VXLAN_CMD_FLUSH] = { vxlan_ctrl_flush, sizeof(struct ifvxlancmd), VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER, }, }; static const int vxlan_control_table_size = nitems(vxlan_control_table); static int vxlan_ftable_addr_cmp(const uint8_t *a, const uint8_t *b) { int i, d; for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) d = ((int)a[i]) - ((int)b[i]); return (d); } static void vxlan_ftable_init(struct vxlan_softc *sc) { int i; sc->vxl_ftable = malloc(sizeof(struct vxlan_ftable_head) * VXLAN_SC_FTABLE_SIZE, M_VXLAN, M_ZERO | M_WAITOK); for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) LIST_INIT(&sc->vxl_ftable[i]); sc->vxl_ftable_hash_key = arc4random(); } static void vxlan_ftable_fini(struct vxlan_softc *sc) { int i; for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) { KASSERT(LIST_EMPTY(&sc->vxl_ftable[i]), ("%s: vxlan %p ftable[%d] not empty", __func__, sc, i)); } MPASS(sc->vxl_ftable_cnt == 0); free(sc->vxl_ftable, M_VXLAN); sc->vxl_ftable = NULL; } static void vxlan_ftable_flush(struct 
vxlan_softc *sc, int all)
{
	struct vxlan_ftable_entry *fe, *tfe;
	int i;

	/*
	 * Destroy every entry when 'all' is nonzero; otherwise only the
	 * dynamic ones.  The _SAFE iterator is needed because entries are
	 * unlinked while walking the bucket.
	 */
	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
		LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) {
			if (all || VXLAN_FE_IS_DYNAMIC(fe))
				vxlan_ftable_entry_destroy(sc, fe);
		}
	}
}

/*
 * Destroy the dynamic forwarding entries whose expiry time has been
 * reached (time_uptime >= vxlfe_expire).  VXLAN_LOCK_WASSERT() is checked
 * on entry.
 */
static void
vxlan_ftable_expire(struct vxlan_softc *sc)
{
	struct vxlan_ftable_entry *fe, *tfe;
	int i;

	VXLAN_LOCK_WASSERT(sc);

	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
		LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) {
			if (VXLAN_FE_IS_DYNAMIC(fe) &&
			    time_uptime >= fe->vxlfe_expire)
				vxlan_ftable_entry_destroy(sc, fe);
		}
	}
}

/*
 * Refresh or create the dynamic forwarding entry that maps 'mac' to the
 * remote address 'vxlsa'.
 *
 * Called with the softc lock held.  When a modification is required and
 * VXLAN_LOCK_WOWNED() is false, the read lock taken with 'tracker' is
 * dropped, the write lock is taken and the lookup is restarted, so the
 * function may return holding either kind of lock.
 *
 * Returns 0 on success (including when nothing had to change), ENOSPC when
 * the table already holds vxl_ftable_max entries, or ENOMEM when a new
 * entry cannot be allocated.
 */
static int
vxlan_ftable_update_locked(struct vxlan_softc *sc,
    const union vxlan_sockaddr *vxlsa, const uint8_t *mac,
    struct rm_priotracker *tracker)
{
	struct vxlan_ftable_entry *fe;
	int error __unused;

	VXLAN_LOCK_ASSERT(sc);

again:
	/*
	 * A forwarding entry for this MAC address might already exist. If
	 * so, update it, otherwise create a new one. We may have to upgrade
	 * the lock if we have to change or create an entry.
	 */
	fe = vxlan_ftable_entry_lookup(sc, mac);
	if (fe != NULL) {
		fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout;

		/* Static entries and unchanged remotes need no rewrite. */
		if (!VXLAN_FE_IS_DYNAMIC(fe) ||
		    vxlan_sockaddr_in_equal(&fe->vxlfe_raddr, &vxlsa->sa))
			return (0);
		if (!VXLAN_LOCK_WOWNED(sc)) {
			VXLAN_RUNLOCK(sc, tracker);
			VXLAN_WLOCK(sc);
			sc->vxl_stats.ftable_lock_upgrade_failed++;
			goto again;
		}
		vxlan_sockaddr_in_copy(&fe->vxlfe_raddr, &vxlsa->sa);
		return (0);
	}

	/* No entry yet: creating one requires the write lock. */
	if (!VXLAN_LOCK_WOWNED(sc)) {
		VXLAN_RUNLOCK(sc, tracker);
		VXLAN_WLOCK(sc);
		sc->vxl_stats.ftable_lock_upgrade_failed++;
		goto again;
	}

	if (sc->vxl_ftable_cnt >= sc->vxl_ftable_max) {
		sc->vxl_stats.ftable_nospace++;
		return (ENOSPC);
	}

	fe = vxlan_ftable_entry_alloc();
	if (fe == NULL)
		return (ENOMEM);

	vxlan_ftable_entry_init(sc, fe, mac, &vxlsa->sa, VXLAN_FE_FLAG_DYNAMIC);

	/* The prior lookup failed, so the insert should not.
*/
	error = vxlan_ftable_entry_insert(sc, fe);
	MPASS(error == 0);

	return (0);
}

/*
 * Learn that 'mac' is reachable through the remote address 'sa'.
 *
 * The address is copied, its port is replaced by the port of the default
 * destination address, an IPv6 scope is embedded when applicable, and the
 * forwarding table is then updated starting from a read lock.
 *
 * Returns 0 on success, or the error reported by
 * vxlan_sockaddr_in6_embedscope() or vxlan_ftable_update_locked().
 */
static int
vxlan_ftable_learn(struct vxlan_softc *sc, const struct sockaddr *sa,
    const uint8_t *mac)
{
	struct rm_priotracker tracker;
	union vxlan_sockaddr vxlsa;
	int error;

	/*
	 * The source port may be randomly selected by the remote host, so
	 * use the port of the default destination address.
	 */
	vxlan_sockaddr_copy(&vxlsa, sa);
	vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port;

	if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) {
		error = vxlan_sockaddr_in6_embedscope(&vxlsa);
		if (error)
			return (error);
	}

	VXLAN_RLOCK(sc, &tracker);
	error = vxlan_ftable_update_locked(sc, &vxlsa, mac, &tracker);
	/*
	 * The update may have switched to the write lock, hence the generic
	 * VXLAN_UNLOCK() rather than VXLAN_RUNLOCK().
	 */
	VXLAN_UNLOCK(sc, &tracker);

	return (error);
}

/*
 * Sysctl handler that renders the forwarding table of the softc passed in
 * arg1 as text, one entry per line, into a fixed-length sbuf and hands the
 * result to sysctl_handle_string().
 */
static int
vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS)
{
	struct rm_priotracker tracker;
	struct sbuf sb;
	struct vxlan_softc *sc;
	struct vxlan_ftable_entry *fe;
	size_t size;
	int i, error;

	/*
	 * This is mostly intended for debugging during development. It is
	 * not practical to dump an entire large table this way.
	 */

	sc = arg1;
	size = PAGE_SIZE;	/* Calculate later.
*/

	sbuf_new(&sb, NULL, size, SBUF_FIXEDLEN);
	sbuf_putc(&sb, '\n');

	VXLAN_RLOCK(sc, &tracker);
	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
		LIST_FOREACH(fe, &sc->vxl_ftable[i], vxlfe_hash) {
			/* Stop adding lines once the sbuf reports an error. */
			if (sbuf_error(&sb) != 0)
				break;
			vxlan_ftable_entry_dump(fe, &sb);
		}
	}
	VXLAN_RUNLOCK(sc, &tracker);

	/* Only the leading newline was written: report an empty string. */
	if (sbuf_len(&sb) == 1)
		sbuf_setpos(&sb, 0);

	sbuf_finish(&sb);
	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
	sbuf_delete(&sb);

	return (error);
}

/*
 * Allocate a zeroed forwarding entry.  The allocation uses M_NOWAIT, so
 * the result may be NULL.
 */
static struct vxlan_ftable_entry *
vxlan_ftable_entry_alloc(void)
{
	struct vxlan_ftable_entry *fe;

	fe = malloc(sizeof(*fe), M_VXLAN, M_ZERO | M_NOWAIT);

	return (fe);
}

/* Free an entry obtained from vxlan_ftable_entry_alloc(). */
static void
vxlan_ftable_entry_free(struct vxlan_ftable_entry *fe)
{

	free(fe, M_VXLAN);
}

/*
 * Fill in an entry: flags, expiry (now plus the softc's table timeout),
 * MAC address and remote socket address.  The entry is not linked into
 * the table here.
 */
static void
vxlan_ftable_entry_init(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe,
    const uint8_t *mac, const struct sockaddr *sa, uint32_t flags)
{

	fe->vxlfe_flags = flags;
	fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout;
	memcpy(fe->vxlfe_mac, mac, ETHER_ADDR_LEN);
	vxlan_sockaddr_copy(&fe->vxlfe_raddr, sa);
}

/* Unlink an entry from its bucket, free it and decrement the entry count. */
static void
vxlan_ftable_entry_destroy(struct vxlan_softc *sc,
    struct vxlan_ftable_entry *fe)
{

	sc->vxl_ftable_cnt--;
	LIST_REMOVE(fe, vxlfe_hash);
	vxlan_ftable_entry_free(fe);
}

/*
 * Link 'fe' into the bucket selected by its MAC address.  Buckets are kept
 * ordered so that an entry comparing greater (vxlan_ftable_addr_cmp() > 0)
 * precedes the ones comparing smaller.
 *
 * Returns 0 and increments the entry count on success, or EEXIST when an
 * entry with the same MAC address is already present.
 */
static int
vxlan_ftable_entry_insert(struct vxlan_softc *sc,
    struct vxlan_ftable_entry *fe)
{
	struct vxlan_ftable_entry *lfe;
	uint32_t hash;
	int dir;

	VXLAN_LOCK_WASSERT(sc);

	hash = VXLAN_SC_FTABLE_HASH(sc, fe->vxlfe_mac);

	lfe = LIST_FIRST(&sc->vxl_ftable[hash]);
	if (lfe == NULL) {
		LIST_INSERT_HEAD(&sc->vxl_ftable[hash], fe, vxlfe_hash);
		goto out;
	}

	do {
		dir = vxlan_ftable_addr_cmp(fe->vxlfe_mac, lfe->vxlfe_mac);
		if (dir == 0)
			return (EEXIST);
		if (dir > 0) {
			LIST_INSERT_BEFORE(lfe, fe, vxlfe_hash);
			goto out;
		} else if (LIST_NEXT(lfe, vxlfe_hash) == NULL) {
			/* Reached the tail: the new entry goes last. */
			LIST_INSERT_AFTER(lfe, fe, vxlfe_hash);
			goto out;
		} else
			lfe = LIST_NEXT(lfe, vxlfe_hash);
	} while (lfe != NULL);

out:
	sc->vxl_ftable_cnt++;

	return (0);
}

static struct vxlan_ftable_entry *
vxlan_ftable_entry_lookup(struct vxlan_softc *sc, const uint8_t *mac)
{
	struct vxlan_ftable_entry *fe;
	uint32_t hash;
	int dir;

	/*
	 * Find the forwarding entry for 'mac' in its hash bucket; returns
	 * NULL when there is none.  The walk stops as soon as 'mac' compares
	 * greater than the current entry, which relies on the bucket order
	 * maintained by vxlan_ftable_entry_insert().
	 */
	VXLAN_LOCK_ASSERT(sc);

	hash = VXLAN_SC_FTABLE_HASH(sc, mac);
	LIST_FOREACH(fe, &sc->vxl_ftable[hash], vxlfe_hash) {
		dir = vxlan_ftable_addr_cmp(mac, fe->vxlfe_mac);
		if (dir == 0)
			return (fe);
		if (dir > 0)
			break;
	}

	return (NULL);
}

/*
 * Append one text line describing 'fe' to 'sb': a 'D' (dynamic) or 'S'
 * marker, the flags in hex, the MAC address, the remote IP address and
 * the expiry time.  If the sbuf reports an error afterwards, its position
 * is reset to where it was on entry so no partial line is left behind.
 */
static void
vxlan_ftable_entry_dump(struct vxlan_ftable_entry *fe, struct sbuf *sb)
{
	char buf[64];
	const union vxlan_sockaddr *sa;
	const void *addr;
	int i, len, af, width;

	sa = &fe->vxlfe_raddr;
	af = sa->sa.sa_family;
	len = sbuf_len(sb);	/* Rollback point. */

	sbuf_printf(sb, "%c 0x%02X ", VXLAN_FE_IS_DYNAMIC(fe) ? 'D' : 'S',
	    fe->vxlfe_flags);

	/* MAC address: colon-separated, the last byte followed by a space. */
	for (i = 0; i < ETHER_ADDR_LEN - 1; i++)
		sbuf_printf(sb, "%02X:", fe->vxlfe_mac[i]);
	sbuf_printf(sb, "%02X ", fe->vxlfe_mac[i]);

	/* Any family other than AF_INET is formatted as IPv6. */
	if (af == AF_INET) {
		addr = &sa->in4.sin_addr;
		width = INET_ADDRSTRLEN - 1;
	} else {
		addr = &sa->in6.sin6_addr;
		width = INET6_ADDRSTRLEN - 1;
	}
	inet_ntop(af, addr, buf, sizeof(buf));
	sbuf_printf(sb, "%*s ", width, buf);

	sbuf_printf(sb, "%08jd", (intmax_t)fe->vxlfe_expire);

	sbuf_putc(sb, '\n');

	/* Truncate a partial line.
*/
	if (sbuf_error(sb) != 0)
		sbuf_setpos(sb, len);
}

/*
 * Allocate and initialize a vxlan socket descriptor for local address
 * 'sa': zeroed memory, rm lock, reference count of 0 and empty VNI hash
 * buckets.  No kernel socket is created here.
 */
static struct vxlan_socket *
vxlan_socket_alloc(const union vxlan_sockaddr *sa)
{
	struct vxlan_socket *vso;
	int i;

	vso = malloc(sizeof(*vso), M_VXLAN, M_WAITOK | M_ZERO);
	rm_init(&vso->vxlso_lock, "vxlansorm");
	refcount_init(&vso->vxlso_refcnt, 0);
	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++)
		LIST_INIT(&vso->vxlso_vni_hash[i]);
	vso->vxlso_laddr = *sa;

	return (vso);
}

/*
 * Tear down a vxlan socket descriptor: close the kernel socket if one is
 * attached, destroy the lock and free the memory.  Under INVARIANTS it is
 * asserted that no multicast slot has an address and that every VNI hash
 * bucket is empty.
 */
static void
vxlan_socket_destroy(struct vxlan_socket *vso)
{
	struct socket *so;
#ifdef INVARIANTS
	int i;
	struct vxlan_socket_mc_info *mc;

	for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
		mc = &vso->vxlso_mc[i];
		KASSERT(mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC,
		    ("%s: socket %p mc[%d] still has address",
		     __func__, vso, i));
	}

	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) {
		KASSERT(LIST_EMPTY(&vso->vxlso_vni_hash[i]),
		    ("%s: socket %p vni_hash[%d] not empty",
		     __func__, vso, i));
	}
#endif
	so = vso->vxlso_sock;
	if (so != NULL) {
		vso->vxlso_sock = NULL;
		soclose(so);
	}

	rm_destroy(&vso->vxlso_lock);
	free(vso, M_VXLAN);
}

/*
 * Drop a reference on 'vso'.  When VXLAN_SO_RELEASE() returns nonzero the
 * socket is removed from the global list while the list lock is held and
 * destroyed after the lock has been dropped.
 */
static void
vxlan_socket_release(struct vxlan_socket *vso)
{
	int destroy;

	VXLAN_LIST_LOCK();
	destroy = VXLAN_SO_RELEASE(vso);
	if (destroy != 0)
		LIST_REMOVE(vso, vxlso_entry);
	VXLAN_LIST_UNLOCK();

	if (destroy != 0)
		vxlan_socket_destroy(vso);
}

/*
 * Search the global socket list for a socket whose local address compares
 * equal to 'vxlsa' (vxlan_sockaddr_cmp() == 0).  A match is returned with
 * a reference acquired; NULL is returned when nothing matches.
 */
static struct vxlan_socket *
vxlan_socket_lookup(union vxlan_sockaddr *vxlsa)
{
	struct vxlan_socket *vso;

	VXLAN_LIST_LOCK();
	LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) {
		if (vxlan_sockaddr_cmp(&vso->vxlso_laddr, &vxlsa->sa) == 0) {
			VXLAN_SO_ACQUIRE(vso);
			break;
		}
	}
	VXLAN_LIST_UNLOCK();

	return (vso);
}

/* Acquire a reference on 'vso' and add it to the global socket list. */
static void
vxlan_socket_insert(struct vxlan_socket *vso)
{

	VXLAN_LIST_LOCK();
	VXLAN_SO_ACQUIRE(vso);
	LIST_INSERT_HEAD(&vxlan_socket_list, vso, vxlso_entry);
	VXLAN_LIST_UNLOCK();
}

/*
 * Create the kernel UDP socket for 'vso' in the address family of its
 * local address, register vxlan_rcv_udp_packet() as its tunneling input
 * function and, when the reuse_port tunable is set, set a port-reuse
 * socket option.  Failures are logged against 'ifp' and returned.
 */
static int
vxlan_socket_init(struct vxlan_socket *vso, struct ifnet *ifp)
{
	struct thread *td;
	int error;

	td = curthread;

	error = socreate(vso->vxlso_laddr.sa.sa_family, &vso->vxlso_sock,
	    SOCK_DGRAM, IPPROTO_UDP,
td->td_ucred, td); if (error) { if_printf(ifp, "cannot create socket: %d\n", error); return (error); } error = udp_set_kernel_tunneling(vso->vxlso_sock, vxlan_rcv_udp_packet, NULL, vso); if (error) { if_printf(ifp, "cannot set tunneling function: %d\n", error); return (error); } if (vxlan_reuse_port != 0) { struct sockopt sopt; int val = 1; bzero(&sopt, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = SO_REUSEPORT; sopt.sopt_val = &val; sopt.sopt_valsize = sizeof(val); error = sosetopt(vso->vxlso_sock, &sopt); if (error) { if_printf(ifp, "cannot set REUSEADDR socket opt: %d\n", error); return (error); } } return (0); } static int vxlan_socket_bind(struct vxlan_socket *vso, struct ifnet *ifp) { union vxlan_sockaddr laddr; struct thread *td; int error; td = curthread; laddr = vso->vxlso_laddr; error = sobind(vso->vxlso_sock, &laddr.sa, td); if (error) { if (error != EADDRINUSE) if_printf(ifp, "cannot bind socket: %d\n", error); return (error); } return (0); } static int vxlan_socket_create(struct ifnet *ifp, int multicast, const union vxlan_sockaddr *saddr, struct vxlan_socket **vsop) { union vxlan_sockaddr laddr; struct vxlan_socket *vso; int error; laddr = *saddr; /* * If this socket will be multicast, then only the local port * must be specified when binding. */ if (multicast != 0) { if (VXLAN_SOCKADDR_IS_IPV4(&laddr)) laddr.in4.sin_addr.s_addr = INADDR_ANY; #ifdef INET6 else laddr.in6.sin6_addr = in6addr_any; #endif } vso = vxlan_socket_alloc(&laddr); if (vso == NULL) return (ENOMEM); error = vxlan_socket_init(vso, ifp); if (error) goto fail; error = vxlan_socket_bind(vso, ifp); if (error) goto fail; /* * There is a small window between the bind completing and * inserting the socket, so that a concurrent create may fail. * Let's not worry about that for now. 
*/
	vxlan_socket_insert(vso);
	*vsop = vso;

	return (0);

fail:
	vxlan_socket_destroy(vso);

	return (error);
}

/*
 * Call vxlan_ifdetach() for every softc attached to 'vso', holding the
 * socket read lock for the duration of the walk.
 */
static void
vxlan_socket_ifdetach(struct vxlan_socket *vso, struct ifnet *ifp,
    struct vxlan_softc_head *list)
{
	struct rm_priotracker tracker;
	struct vxlan_softc *sc;
	int i;

	VXLAN_SO_RLOCK(vso, &tracker);
	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) {
		LIST_FOREACH(sc, &vso->vxlso_vni_hash[i], vxl_entry)
			vxlan_ifdetach(sc, ifp, list);
	}
	VXLAN_SO_RUNLOCK(vso, &tracker);
}

/*
 * Look up the socket bound to the wildcard address: 'vxlsa' is copied,
 * its IP address is replaced by INADDR_ANY / in6addr_any, and the result
 * is passed to vxlan_socket_lookup().
 */
static struct vxlan_socket *
vxlan_socket_mc_lookup(const union vxlan_sockaddr *vxlsa)
{
	union vxlan_sockaddr laddr;
	struct vxlan_socket *vso;

	laddr = *vxlsa;

	if (VXLAN_SOCKADDR_IS_IPV4(&laddr))
		laddr.in4.sin_addr.s_addr = INADDR_ANY;
#ifdef INET6
	else
		laddr.in6.sin6_addr = in6addr_any;
#endif

	vso = vxlan_socket_lookup(&laddr);

	return (vso);
}

/*
 * Return 1 when the multicast slot 'mc' describes the same membership as
 * (group, local, ifidx), 0 otherwise.  A wildcard 'local' address and an
 * 'ifidx' of 0 each match any stored value.
 */
static int
vxlan_sockaddr_mc_info_match(const struct vxlan_socket_mc_info *mc,
    const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
    int ifidx)
{

	if (!vxlan_sockaddr_in_any(local) &&
	    !vxlan_sockaddr_in_equal(&mc->vxlsomc_saddr, &local->sa))
		return (0);
	if (!vxlan_sockaddr_in_equal(&mc->vxlsomc_gaddr, &group->sa))
		return (0);
	if (ifidx != 0 && ifidx != mc->vxlsomc_ifidx)
		return (0);

	return (1);
}

/*
 * Join the multicast 'group' on the kernel socket of 'vso' using
 * IP_ADD_MEMBERSHIP (IPv4) or IPV6_JOIN_GROUP (IPv6).  '*source' receives
 * a copy of 'local'.  Returns 0, the sosetopt() error, or EAFNOSUPPORT for
 * any other address family.
 */
static int
vxlan_socket_mc_join_group(struct vxlan_socket *vso,
    const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
    int *ifidx, union vxlan_sockaddr *source)
{
	struct sockopt sopt;
	int error;

	*source = *local;

	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
		struct ip_mreq mreq;

		mreq.imr_multiaddr = group->in4.sin_addr;
		mreq.imr_interface = local->in4.sin_addr;

		bzero(&sopt, sizeof(sopt));
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_level = IPPROTO_IP;
		sopt.sopt_name = IP_ADD_MEMBERSHIP;
		sopt.sopt_val = &mreq;
		sopt.sopt_valsize = sizeof(mreq);
		error = sosetopt(vso->vxlso_sock, &sopt);
		if (error)
			return (error);

		/*
		 * BMV: Ideally, there would be a formal way for us to get
		 * the local interface that was selected based on the
		 * imr_interface address.
We could then update *ifidx so
		 * vxlan_sockaddr_mc_info_match() would return a match for
		 * later creates that explicitly set the multicast interface.
		 *
		 * If we really need to, we can of course look in the INP's
		 * membership list:
		 *     sotoinpcb(vso->vxlso_sock)->inp_moptions->
		 *         imo_head[]->imf_inm->inm_ifp
		 * similarly to imo_match_group().
		 */
		source->in4.sin_addr = local->in4.sin_addr;

	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
		struct ipv6_mreq mreq;

		mreq.ipv6mr_multiaddr = group->in6.sin6_addr;
		mreq.ipv6mr_interface = *ifidx;

		bzero(&sopt, sizeof(sopt));
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_level = IPPROTO_IPV6;
		sopt.sopt_name = IPV6_JOIN_GROUP;
		sopt.sopt_val = &mreq;
		sopt.sopt_valsize = sizeof(mreq);
		error = sosetopt(vso->vxlso_sock, &sopt);
		if (error)
			return (error);

		/*
		 * BMV: As with IPv4, we would really like to know what
		 * interface in6p_lookup_mcast_ifp() selected.
		 */
	} else
		error = EAFNOSUPPORT;

	return (error);
}

/*
 * Leave the multicast 'group' on the kernel socket of 'vso' using
 * IP_DROP_MEMBERSHIP (IPv4, interface given by the address in 'source')
 * or IPV6_LEAVE_GROUP (IPv6, interface given by 'ifidx').  Returns the
 * sosetopt() result, or EAFNOSUPPORT for any other address family.
 */
static int
vxlan_socket_mc_leave_group(struct vxlan_socket *vso,
    const union vxlan_sockaddr *group, const union vxlan_sockaddr *source,
    int ifidx)
{
	struct sockopt sopt;
	int error;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;

	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
		struct ip_mreq mreq;

		mreq.imr_multiaddr = group->in4.sin_addr;
		mreq.imr_interface = source->in4.sin_addr;

		sopt.sopt_level = IPPROTO_IP;
		sopt.sopt_name = IP_DROP_MEMBERSHIP;
		sopt.sopt_val = &mreq;
		sopt.sopt_valsize = sizeof(mreq);
		error = sosetopt(vso->vxlso_sock, &sopt);

	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
		struct ipv6_mreq mreq;

		mreq.ipv6mr_multiaddr = group->in6.sin6_addr;
		mreq.ipv6mr_interface = ifidx;

		sopt.sopt_level = IPPROTO_IPV6;
		sopt.sopt_name = IPV6_LEAVE_GROUP;
		sopt.sopt_val = &mreq;
		sopt.sopt_valsize = sizeof(mreq);
		error = sosetopt(vso->vxlso_sock, &sopt);

	} else
		error = EAFNOSUPPORT;

	return (error);
}

/*
 * Register one more user of a multicast group on 'vso', joining the group
 * on the kernel socket when no matching slot exists yet.  On success the
 * slot index is stored in '*idx' and 0 is returned; ENOSPC is returned
 * when every slot is taken, and join errors are passed through.
 */
static int
vxlan_socket_mc_add_group(struct vxlan_socket *vso,
    const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
    int ifidx, int *idx)
{
	union
vxlan_sockaddr source;
	struct vxlan_socket_mc_info *mc;
	int i, empty, error;

	/*
	 * Within a socket, the same multicast group may be used by multiple
	 * interfaces, each with a different network identifier. But a socket
	 * may only join a multicast group once, so keep track of the users
	 * here.
	 */

	/* Pass 1: reuse a matching slot, counting the free ones on the way. */
	VXLAN_SO_WLOCK(vso);
	for (empty = 0, i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
		mc = &vso->vxlso_mc[i];

		if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) {
			empty++;
			continue;
		}

		if (vxlan_sockaddr_mc_info_match(mc, group, local, ifidx))
			goto out;
	}
	VXLAN_SO_WUNLOCK(vso);

	if (empty == 0)
		return (ENOSPC);

	/* The join itself is performed without the socket lock held. */
	error = vxlan_socket_mc_join_group(vso, group, local, &ifidx, &source);
	if (error)
		return (error);

	/* Pass 2: claim a free slot for the membership just created. */
	VXLAN_SO_WLOCK(vso);
	for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
		mc = &vso->vxlso_mc[i];

		if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) {
			vxlan_sockaddr_copy(&mc->vxlsomc_gaddr, &group->sa);
			vxlan_sockaddr_copy(&mc->vxlsomc_saddr, &source.sa);
			mc->vxlsomc_ifidx = ifidx;
			goto out;
		}
	}
	VXLAN_SO_WUNLOCK(vso);

	/* No slot was free on the second pass: undo the join. */
	error = vxlan_socket_mc_leave_group(vso, group, &source, ifidx);
	MPASS(error == 0);

	return (ENOSPC);

out:
	/* Reached with the socket write lock held and 'mc'/'i' selected. */
	mc->vxlsomc_users++;
	VXLAN_SO_WUNLOCK(vso);

	*idx = i;

	return (0);
}

/*
 * Drop one user of the multicast slot 'idx' of 'vso'.  When the last user
 * goes away the slot is cleared under the socket write lock and the group
 * is left on the kernel socket after the lock has been dropped.
 */
static void
vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *vso, int idx)
{
	union vxlan_sockaddr group, source;
	struct vxlan_socket_mc_info *mc;
	int ifidx, leave;

	KASSERT(idx >= 0 && idx < VXLAN_SO_MC_MAX_GROUPS,
	    ("%s: vso %p idx %d out of bounds", __func__, vso, idx));

	leave = 0;
	mc = &vso->vxlso_mc[idx];

	VXLAN_SO_WLOCK(vso);
	mc->vxlsomc_users--;
	if (mc->vxlsomc_users == 0) {
		/* Save what the leave call needs before wiping the slot. */
		group = mc->vxlsomc_gaddr;
		source = mc->vxlsomc_saddr;
		ifidx = mc->vxlsomc_ifidx;
		bzero(mc, sizeof(*mc));
		leave = 1;
	}
	VXLAN_SO_WUNLOCK(vso);

	if (leave != 0) {
		/*
		 * Our socket's membership in this group may have already
		 * been removed if we joined through an interface that's
		 * been detached.
*/
		vxlan_socket_mc_leave_group(vso, &group, &source, ifidx);
	}
}

/*
 * Find the softc with network identifier 'vni' on 'vso'.  The socket lock
 * must be held (VXLAN_SO_LOCK_ASSERT).  A match is returned after
 * VXLAN_ACQUIRE(); NULL is returned when there is none.
 */
static struct vxlan_softc *
vxlan_socket_lookup_softc_locked(struct vxlan_socket *vso, uint32_t vni)
{
	struct vxlan_softc *sc;
	uint32_t hash;

	VXLAN_SO_LOCK_ASSERT(vso);
	hash = VXLAN_SO_VNI_HASH(vni);

	LIST_FOREACH(sc, &vso->vxlso_vni_hash[hash], vxl_entry) {
		if (sc->vxl_vni == vni) {
			VXLAN_ACQUIRE(sc);
			break;
		}
	}

	return (sc);
}

/* Same as vxlan_socket_lookup_softc_locked(), taking the read lock itself. */
static struct vxlan_softc *
vxlan_socket_lookup_softc(struct vxlan_socket *vso, uint32_t vni)
{
	struct rm_priotracker tracker;
	struct vxlan_softc *sc;

	VXLAN_SO_RLOCK(vso, &tracker);
	sc = vxlan_socket_lookup_softc_locked(vso, vni);
	VXLAN_SO_RUNLOCK(vso, &tracker);

	return (sc);
}

/*
 * Attach 'sc' to 'vso' under its VNI.  Returns EEXIST when a softc with
 * the same VNI is already attached (the reference taken by the lookup is
 * released again); otherwise acquires a reference on 'sc', links it into
 * the VNI hash and returns 0.
 */
static int
vxlan_socket_insert_softc(struct vxlan_socket *vso, struct vxlan_softc *sc)
{
	struct vxlan_softc *tsc;
	uint32_t vni, hash;

	vni = sc->vxl_vni;
	hash = VXLAN_SO_VNI_HASH(vni);

	VXLAN_SO_WLOCK(vso);
	tsc = vxlan_socket_lookup_softc_locked(vso, vni);
	if (tsc != NULL) {
		VXLAN_SO_WUNLOCK(vso);
		vxlan_release(tsc);
		return (EEXIST);
	}

	VXLAN_ACQUIRE(sc);
	LIST_INSERT_HEAD(&vso->vxlso_vni_hash[hash], sc, vxl_entry);
	VXLAN_SO_WUNLOCK(vso);

	return (0);
}

/*
 * Detach 'sc' from the VNI hash of 'vso' and release the reference that
 * vxlan_socket_insert_softc() acquired.
 */
static void
vxlan_socket_remove_softc(struct vxlan_socket *vso, struct vxlan_softc *sc)
{

	VXLAN_SO_WLOCK(vso);
	LIST_REMOVE(sc, vxl_entry);
	VXLAN_SO_WUNLOCK(vso);

	vxlan_release(sc);
}

/*
 * Return the multicast interface recorded in the IPv4 ('ipv4' nonzero) or
 * IPv6 multicast options of 'sc', with a reference taken via if_ref(), or
 * NULL when no such options or interface exist.
 */
static struct ifnet *
vxlan_multicast_if_ref(struct vxlan_softc *sc, int ipv4)
{
	struct ifnet *ifp;

	VXLAN_LOCK_ASSERT(sc);

	if (ipv4 && sc->vxl_im4o != NULL)
		ifp = sc->vxl_im4o->imo_multicast_ifp;
	else if (!ipv4 && sc->vxl_im6o != NULL)
		ifp = sc->vxl_im6o->im6o_multicast_ifp;
	else
		ifp = NULL;

	if (ifp != NULL)
		if_ref(ifp);

	return (ifp);
}

/*
 * Release the multicast state of 'sc': drop the reference on the
 * multicast interface and free the IPv4/IPv6 multicast option structures,
 * clearing each pointer afterwards.
 */
static void
vxlan_free_multicast(struct vxlan_softc *sc)
{

	if (sc->vxl_mc_ifp != NULL) {
		if_rele(sc->vxl_mc_ifp);
		sc->vxl_mc_ifp = NULL;
		sc->vxl_mc_ifindex = 0;
	}

	if (sc->vxl_im4o != NULL) {
		free(sc->vxl_im4o, M_VXLAN);
		sc->vxl_im4o = NULL;
	}

	if (sc->vxl_im6o != NULL) {
		free(sc->vxl_im6o, M_VXLAN);
		sc->vxl_im6o = NULL;
	}
}

static int
vxlan_setup_multicast_interface(struct vxlan_softc *sc)
{
	struct ifnet *ifp;

	/*
	 * Resolve the configured multicast interface name to an ifnet and
	 * keep the reference returned by ifunit_ref() in the softc.  Returns
	 * ENOENT when the interface does not exist and ENOTSUP when it lacks
	 * IFF_MULTICAST (the reference is dropped again in that case).
	 */
	ifp = ifunit_ref(sc->vxl_mc_ifname);
	if (ifp == NULL) {
		if_printf(sc->vxl_ifp, "multicast interface %s does "
		    "not exist\n", sc->vxl_mc_ifname);
		return (ENOENT);
	}

	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
		if_printf(sc->vxl_ifp, "interface %s does not support "
		     "multicast\n", sc->vxl_mc_ifname);
		if_rele(ifp);
		return (ENOTSUP);
	}

	sc->vxl_mc_ifp = ifp;
	sc->vxl_mc_ifindex = ifp->if_index;

	return (0);
}

/*
 * Prepare the multicast state of 'sc' for its destination group: resolve
 * the multicast interface when a name is configured, then allocate the
 * IPv4 or IPv6 multicast options matching the destination address family.
 * Returns 0 or the error from vxlan_setup_multicast_interface().
 */
static int
vxlan_setup_multicast(struct vxlan_softc *sc)
{
	const union vxlan_sockaddr *group;
	int error;

	group = &sc->vxl_dst_addr;
	error = 0;

	if (sc->vxl_mc_ifname[0] != '\0') {
		error = vxlan_setup_multicast_interface(sc);
		if (error)
			return (error);
	}

	/*
	 * Initialize an multicast options structure that is sufficiently
	 * populated for use in the respective IP output routine. This
	 * structure is typically stored in the socket, but our sockets
	 * may be shared among multiple interfaces.
	 */
	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
		sc->vxl_im4o = malloc(sizeof(struct ip_moptions), M_VXLAN,
		    M_ZERO | M_WAITOK);
		sc->vxl_im4o->imo_multicast_ifp = sc->vxl_mc_ifp;
		sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl;
		sc->vxl_im4o->imo_multicast_vif = -1;
	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
		sc->vxl_im6o = malloc(sizeof(struct ip6_moptions), M_VXLAN,
		    M_ZERO | M_WAITOK);
		sc->vxl_im6o->im6o_multicast_ifp = sc->vxl_mc_ifp;
		sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl;
	}

	return (error);
}

/*
 * Attach 'sc' to a vxlan socket for its source address: create a new
 * socket or fall back to an existing one, join the multicast group when
 * the destination is multicast, and register the softc under its VNI.
 * On failure everything acquired so far is released and the error is
 * returned.
 */
static int
vxlan_setup_socket(struct vxlan_softc *sc)
{
	struct vxlan_socket *vso;
	struct ifnet *ifp;
	union vxlan_sockaddr *saddr, *daddr;
	int multicast, error;

	vso = NULL;
	ifp = sc->vxl_ifp;
	saddr = &sc->vxl_src_addr;
	daddr = &sc->vxl_dst_addr;

	multicast = vxlan_sockaddr_in_multicast(daddr);
	MPASS(multicast != -1);
	sc->vxl_vso_mc_index = -1;

	/*
	 * Try to create the socket. If that fails, attempt to use an
	 * existing socket.
*/
	error = vxlan_socket_create(ifp, multicast, saddr, &vso);
	if (error) {
		if (multicast != 0)
			vso = vxlan_socket_mc_lookup(saddr);
		else
			vso = vxlan_socket_lookup(saddr);

		if (vso == NULL) {
			if_printf(ifp, "cannot create socket (error: %d), "
			    "and no existing socket found\n", error);
			goto out;
		}
	}

	if (multicast != 0) {
		error = vxlan_setup_multicast(sc);
		if (error)
			goto out;

		error = vxlan_socket_mc_add_group(vso, daddr, saddr,
		    sc->vxl_mc_ifindex, &sc->vxl_vso_mc_index);
		if (error)
			goto out;
	}

	sc->vxl_sock = vso;
	error = vxlan_socket_insert_softc(vso, sc);
	if (error) {
		sc->vxl_sock = NULL;
		if_printf(ifp, "network identifier %d already exists in "
		    "this socket\n", sc->vxl_vni);
		goto out;
	}

	return (0);

out:
	/* Undo in reverse order: group membership, options, socket ref. */
	if (vso != NULL) {
		if (sc->vxl_vso_mc_index != -1) {
			vxlan_socket_mc_release_group_by_idx(vso,
			    sc->vxl_vso_mc_index);
			sc->vxl_vso_mc_index = -1;
		}
		if (multicast != 0)
			vxlan_free_multicast(sc);
		vxlan_socket_release(vso);
	}

	return (error);
}

#ifdef INET6
/*
 * For an IPv6 source address, record the tunnel port in
 * V_zero_checksum_port if that variable is still 0.  Nothing is changed
 * when the source and destination ports differ or when the variable is
 * already set; a message is printed in the first case and, in the second,
 * only when the stored port differs from this tunnel's port.
 */
static void
vxlan_setup_zero_checksum_port(struct vxlan_softc *sc)
{

	if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr))
		return;

	MPASS(sc->vxl_src_addr.in6.sin6_port != 0);
	MPASS(sc->vxl_dst_addr.in6.sin6_port != 0);

	if (sc->vxl_src_addr.in6.sin6_port != sc->vxl_dst_addr.in6.sin6_port) {
		if_printf(sc->vxl_ifp, "port %d in src address does not match "
		    "port %d in dst address, rfc6935_port (%d) not updated.\n",
		    ntohs(sc->vxl_src_addr.in6.sin6_port),
		    ntohs(sc->vxl_dst_addr.in6.sin6_port),
		    V_zero_checksum_port);
		return;
	}

	if (V_zero_checksum_port != 0) {
		if (V_zero_checksum_port !=
		    ntohs(sc->vxl_src_addr.in6.sin6_port)) {
			if_printf(sc->vxl_ifp, "rfc6935_port is already set to "
			    "%d, cannot set it to %d.\n", V_zero_checksum_port,
			    ntohs(sc->vxl_src_addr.in6.sin6_port));
		}
		return;
	}

	V_zero_checksum_port = ntohs(sc->vxl_src_addr.in6.sin6_port);
	if_printf(sc->vxl_ifp, "rfc6935_port set to %d\n",
	    V_zero_checksum_port);
}
#endif

/*
 * Recompute the interface header length from the destination address
 * family and, unless the user has set an MTU, derive the MTU from it.
 */
static void
vxlan_setup_interface_hdrlen(struct vxlan_softc *sc)
{
	struct ifnet *ifp;
VXLAN_LOCK_WASSERT(sc);

	ifp = sc->vxl_ifp;
	/* Inner Ethernet header plus the VXLAN/UDP header ... */
	ifp->if_hdrlen = ETHER_HDR_LEN + sizeof(struct vxlanudphdr);

	/* ... plus the outer IP header for the destination's family. */
	if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr) != 0)
		ifp->if_hdrlen += sizeof(struct ip);
	else if (VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_dst_addr) != 0)
		ifp->if_hdrlen += sizeof(struct ip6_hdr);

	if ((sc->vxl_flags & VXLAN_FLAG_USER_MTU) == 0)
		ifp->if_mtu = ETHERMTU - ifp->if_hdrlen;
}

/*
 * Check that the softc configuration is complete enough to bring the
 * interface up.  Returns 0 when it is; otherwise prints the reason against
 * the interface and returns EINVAL.
 */
static int
vxlan_valid_init_config(struct vxlan_softc *sc)
{
	const char *reason;

	if (vxlan_check_vni(sc->vxl_vni) != 0) {
		reason = "invalid virtual network identifier specified";
		goto fail;
	}

	if (vxlan_sockaddr_supported(&sc->vxl_src_addr, 1) == 0) {
		reason = "source address type is not supported";
		goto fail;
	}

	if (vxlan_sockaddr_supported(&sc->vxl_dst_addr, 0) == 0) {
		reason = "destination address type is not supported";
		goto fail;
	}

	if (vxlan_sockaddr_in_any(&sc->vxl_dst_addr) != 0) {
		reason = "no valid destination address specified";
		goto fail;
	}

	if (vxlan_sockaddr_in_multicast(&sc->vxl_dst_addr) == 0 &&
	    sc->vxl_mc_ifname[0] != '\0') {
		reason = "can only specify interface with a group address";
		goto fail;
	}

	/* The family check only applies to a non-wildcard source address. */
	if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
		if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_src_addr) ^
		    VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr)) {
			reason = "source and destination address must both "
			    "be either IPv4 or IPv6";
			goto fail;
		}
	}

	if (sc->vxl_src_addr.in4.sin_port == 0) {
		reason = "local port not specified";
		goto fail;
	}

	if (sc->vxl_dst_addr.in4.sin_port == 0) {
		reason = "remote port not specified";
		goto fail;
	}

	return (0);

fail:
	if_printf(sc->vxl_ifp, "cannot initialize interface: %s\n", reason);
	return (EINVAL);
}

/*
 * Sleep, with a timeout of hz ticks per iteration, until
 * VXLAN_FLAG_INIT is no longer set in the softc flags.
 */
static void
vxlan_init_wait(struct vxlan_softc *sc)
{

	VXLAN_LOCK_WASSERT(sc);
	while (sc->vxl_flags & VXLAN_FLAG_INIT)
		rm_sleep(sc, &sc->vxl_lock, 0, "vxlint", hz);
}

/* Clear VXLAN_FLAG_INIT under the write lock and wake sleepers on 'sc'. */
static void
vxlan_init_complete(struct vxlan_softc *sc)
{

	VXLAN_WLOCK(sc);
	sc->vxl_flags &= ~VXLAN_FLAG_INIT;
	wakeup(sc);
	VXLAN_WUNLOCK(sc);
}

/*
 * Bring the interface up: validate the configuration, set up the socket,
 * initialize the default forwarding entry, mark the interface running and
 * start the forwarding-table callout.  Does nothing when the interface is
 * already running.
 */
static void
vxlan_init(void *xsc)
{
	static const uint8_t
empty_mac[ETHER_ADDR_LEN];
	struct vxlan_softc *sc;
	struct ifnet *ifp;

	sc = xsc;
	ifp = sc->vxl_ifp;

	sx_xlock(&vxlan_sx);
	VXLAN_WLOCK(sc);
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		VXLAN_WUNLOCK(sc);
		sx_xunlock(&vxlan_sx);
		return;
	}
	/* Mark initialization in progress; cleared by vxlan_init_complete(). */
	sc->vxl_flags |= VXLAN_FLAG_INIT;
	VXLAN_WUNLOCK(sc);

	if (vxlan_valid_init_config(sc) != 0)
		goto out;

	if (vxlan_setup_socket(sc) != 0)
		goto out;

#ifdef INET6
	vxlan_setup_zero_checksum_port(sc);
#endif

	/* Initialize the default forwarding entry. */
	vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac,
	    &sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC);

	VXLAN_WLOCK(sc);
	ifp->if_drv_flags |= IFF_DRV_RUNNING;
	callout_reset(&sc->vxl_callout, vxlan_ftable_prune_period * hz,
	    vxlan_timer, sc);
	VXLAN_WUNLOCK(sc);

	if_link_state_change(ifp, LINK_STATE_UP);

	EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family,
	    ntohs(sc->vxl_src_addr.in4.sin_port));
out:
	/* Both the success and the failure path end the INIT phase here. */
	vxlan_init_complete(sc);
	sx_xunlock(&vxlan_sx);
}

/*
 * Drop a reference on 'sc' and call wakeup(sc) when VXLAN_RELEASE()
 * returns nonzero.
 */
static void
vxlan_release(struct vxlan_softc *sc)
{

	/*
	 * The softc may be destroyed as soon as we release our reference,
	 * so we cannot serialize the wakeup with the softc lock. We use a
	 * timeout in our sleeps so a missed wakeup is unfortunate but not
	 * fatal.
*/
	if (VXLAN_RELEASE(sc) != 0)
		wakeup(sc);
}

/*
 * Sleep, with a timeout of hz ticks per iteration, until
 * VXLAN_FLAG_TEARDOWN is no longer set in the softc flags.
 */
static void
vxlan_teardown_wait(struct vxlan_softc *sc)
{

	VXLAN_LOCK_WASSERT(sc);
	while (sc->vxl_flags & VXLAN_FLAG_TEARDOWN)
		rm_sleep(sc, &sc->vxl_lock, 0, "vxltrn", hz);
}

/* Clear VXLAN_FLAG_TEARDOWN under the write lock and wake sleepers. */
static void
vxlan_teardown_complete(struct vxlan_softc *sc)
{

	VXLAN_WLOCK(sc);
	sc->vxl_flags &= ~VXLAN_FLAG_TEARDOWN;
	wakeup(sc);
	VXLAN_WUNLOCK(sc);
}

/*
 * Take the interface down.  Entered with vxlan_sx held exclusively, the
 * softc write lock held and VXLAN_FLAG_TEARDOWN already set; the softc
 * lock is released before returning.
 */
static void
vxlan_teardown_locked(struct vxlan_softc *sc)
{
	struct ifnet *ifp;
	struct vxlan_socket *vso;

	sx_assert(&vxlan_sx, SA_XLOCKED);
	VXLAN_LOCK_WASSERT(sc);
	MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN);

	ifp = sc->vxl_ifp;
	ifp->if_flags &= ~IFF_UP;
	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	callout_stop(&sc->vxl_callout);
	/* Detach the socket pointer while still holding the softc lock. */
	vso = sc->vxl_sock;
	sc->vxl_sock = NULL;

	VXLAN_WUNLOCK(sc);
	if_link_state_change(ifp, LINK_STATE_DOWN);
	EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family,
	    ntohs(sc->vxl_src_addr.in4.sin_port));

	if (vso != NULL) {
		vxlan_socket_remove_softc(vso, sc);

		if (sc->vxl_vso_mc_index != -1) {
			vxlan_socket_mc_release_group_by_idx(vso,
			    sc->vxl_vso_mc_index);
			sc->vxl_vso_mc_index = -1;
		}
	}

	/* Wait for the reference count to reach zero before freeing state. */
	VXLAN_WLOCK(sc);
	while (sc->vxl_refcnt != 0)
		rm_sleep(sc, &sc->vxl_lock, 0, "vxldrn", hz);
	VXLAN_WUNLOCK(sc);

	callout_drain(&sc->vxl_callout);

	vxlan_free_multicast(sc);
	if (vso != NULL)
		vxlan_socket_release(vso);

	vxlan_teardown_complete(sc);
}

/*
 * Tear the interface down, or wait for the teardown already in progress
 * when VXLAN_FLAG_TEARDOWN is found set.
 */
static void
vxlan_teardown(struct vxlan_softc *sc)
{

	sx_xlock(&vxlan_sx);
	VXLAN_WLOCK(sc);
	if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) {
		vxlan_teardown_wait(sc);
		VXLAN_WUNLOCK(sc);
		sx_xunlock(&vxlan_sx);
		return;
	}

	sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
	/* vxlan_teardown_locked() drops the softc write lock. */
	vxlan_teardown_locked(sc);
	sx_xunlock(&vxlan_sx);
}

/*
 * If 'ifp' is the multicast interface of 'sc' and no teardown is pending,
 * set VXLAN_FLAG_TEARDOWN and queue the softc on 'list'.
 * NOTE(review): the caller presumably tears down the queued softcs
 * afterwards -- confirm in vxlan_ifdetach_event(), not shown here.
 */
static void
vxlan_ifdetach(struct vxlan_softc *sc, struct ifnet *ifp,
    struct vxlan_softc_head *list)
{

	VXLAN_WLOCK(sc);

	if (sc->vxl_mc_ifp != ifp)
		goto out;
	if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN)
		goto out;

	sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
	LIST_INSERT_HEAD(list, sc, vxl_ifdetach_list);

out:
	VXLAN_WUNLOCK(sc);
}

/*
 * Callout handler: expire old forwarding entries and re-arm the callout
 * for another vxlan_ftable_prune_period seconds.
 */
static void
vxlan_timer(void *xsc)
{
	struct
vxlan_softc *sc;

	sc = xsc;
	VXLAN_LOCK_WASSERT(sc);

	vxlan_ftable_expire(sc);
	/* Re-arm for another vxlan_ftable_prune_period * hz ticks. */
	callout_schedule(&sc->vxl_callout, vxlan_ftable_prune_period * hz);
}

/*
 * Bring the driver state in line with the interface flags: call
 * vxlan_init() when IFF_UP is set but the interface is not running, and
 * vxlan_teardown() when IFF_UP is clear but it still is.  Always returns 0.
 */
static int
vxlan_ioctl_ifflags(struct vxlan_softc *sc)
{
	struct ifnet *ifp;

	ifp = sc->vxl_ifp;

	if (ifp->if_flags & IFF_UP) {
		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
			vxlan_init(sc);
	} else {
		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
			vxlan_teardown(sc);
	}

	return (0);
}

/*
 * Control handler: fill the struct ifvxlancfg at arg with the current
 * configuration, read under the softc read lock.  With INET6, IPv6 local
 * and remote addresses are passed through sa6_recoverscope() after the lock
 * has been dropped.  Always returns 0.
 */
static int
vxlan_ctrl_get_config(struct vxlan_softc *sc, void *arg)
{
	struct rm_priotracker tracker;
	struct ifvxlancfg *cfg;

	cfg = arg;
	bzero(cfg, sizeof(*cfg));

	VXLAN_RLOCK(sc, &tracker);
	cfg->vxlc_vni = sc->vxl_vni;
	memcpy(&cfg->vxlc_local_sa, &sc->vxl_src_addr,
	    sizeof(union vxlan_sockaddr));
	memcpy(&cfg->vxlc_remote_sa, &sc->vxl_dst_addr,
	    sizeof(union vxlan_sockaddr));
	cfg->vxlc_mc_ifindex = sc->vxl_mc_ifindex;
	cfg->vxlc_ftable_cnt = sc->vxl_ftable_cnt;
	cfg->vxlc_ftable_max = sc->vxl_ftable_max;
	cfg->vxlc_ftable_timeout = sc->vxl_ftable_timeout;
	cfg->vxlc_port_min = sc->vxl_min_port;
	cfg->vxlc_port_max = sc->vxl_max_port;
	cfg->vxlc_learn = (sc->vxl_flags & VXLAN_FLAG_LEARN) != 0;
	cfg->vxlc_ttl = sc->vxl_ttl;
	VXLAN_RUNLOCK(sc, &tracker);

#ifdef INET6
	if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_local_sa))
		sa6_recoverscope(&cfg->vxlc_local_sa.in6);
	if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_remote_sa))
		sa6_recoverscope(&cfg->vxlc_remote_sa.in6);
#endif

	return (0);
}

/*
 * Control handler: set the VNI from the struct ifvxlancmd at arg.
 * Returns EINVAL if vxlan_check_vni() rejects the value, EBUSY if
 * vxlan_can_change_config() says the configuration cannot be changed now,
 * otherwise 0.
 */
static int
vxlan_ctrl_set_vni(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	int error;

	cmd = arg;

	if (vxlan_check_vni(cmd->vxlcmd_vni) != 0)
		return (EINVAL);

	VXLAN_WLOCK(sc);
	if (vxlan_can_change_config(sc)) {
		sc->vxl_vni = cmd->vxlcmd_vni;
		error = 0;
	} else
		error = EBUSY;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set the local (source) address from the struct
 * ifvxlancmd at arg.  The address must be IPv4 or IPv6 and must not be
 * multicast (EINVAL otherwise); an IPv6 address gets its scope embedded
 * first.  Returns EBUSY if the configuration cannot be changed now.  On
 * success the hardware capabilities are recomputed.
 */
static int
vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	union vxlan_sockaddr *vxlsa;
	int error;

	cmd = arg;
	vxlsa = &cmd->vxlcmd_sa;

	if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa))
		return (EINVAL);
	if (vxlan_sockaddr_in_multicast(vxlsa) != 0)
		return
(EINVAL);

	if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) {
		error = vxlan_sockaddr_in6_embedscope(vxlsa);
		if (error)
			return (error);
	}

	VXLAN_WLOCK(sc);
	if (vxlan_can_change_config(sc)) {
		vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa);
		vxlan_set_hwcaps(sc);
		error = 0;
	} else
		error = EBUSY;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set the remote (destination) address from the struct
 * ifvxlancmd at arg.  The address must be IPv4 or IPv6 (EINVAL otherwise);
 * unlike the local address, a multicast address is not rejected here.  An
 * IPv6 address gets its scope embedded first.  Returns EBUSY if the
 * configuration cannot be changed now.  On success the interface header
 * length is recomputed.
 */
static int
vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	union vxlan_sockaddr *vxlsa;
	int error;

	cmd = arg;
	vxlsa = &cmd->vxlcmd_sa;

	if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa))
		return (EINVAL);
	if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) {
		error = vxlan_sockaddr_in6_embedscope(vxlsa);
		if (error)
			return (error);
	}

	VXLAN_WLOCK(sc);
	if (vxlan_can_change_config(sc)) {
		vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa);
		vxlan_setup_interface_hdrlen(sc);
		error = 0;
	} else
		error = EBUSY;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set the local UDP port.  Port 0 is rejected with EINVAL;
 * EBUSY is returned if the configuration cannot be changed now.  The port
 * is stored in network byte order through the in4 member of the address
 * union.
 */
static int
vxlan_ctrl_set_local_port(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	int error;

	cmd = arg;

	if (cmd->vxlcmd_port == 0)
		return (EINVAL);

	VXLAN_WLOCK(sc);
	if (vxlan_can_change_config(sc)) {
		sc->vxl_src_addr.in4.sin_port = htons(cmd->vxlcmd_port);
		error = 0;
	} else
		error = EBUSY;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set the remote UDP port.  Same validation and return
 * values as the local port handler, applied to vxl_dst_addr.
 */
static int
vxlan_ctrl_set_remote_port(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	int error;

	cmd = arg;

	if (cmd->vxlcmd_port == 0)
		return (EINVAL);

	VXLAN_WLOCK(sc);
	if (vxlan_can_change_config(sc)) {
		sc->vxl_dst_addr.in4.sin_port = htons(cmd->vxlcmd_port);
		error = 0;
	} else
		error = EBUSY;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set the source port range [min, max].  Returns EINVAL
 * when max < min and EBUSY if the configuration cannot be changed now.
 */
static int
vxlan_ctrl_set_port_range(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	uint16_t min, max;
	int error;

	cmd = arg;
	min = cmd->vxlcmd_port_min;
	max = cmd->vxlcmd_port_max;

	if (max < min)
		return (EINVAL);

	VXLAN_WLOCK(sc);
	if (vxlan_can_change_config(sc)) {
		sc->vxl_min_port = min;
		sc->vxl_max_port = max;
		error = 0;
	} else
		error = EBUSY;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set the forwarding table entry timeout.  Returns EINVAL
 * if vxlan_check_ftable_timeout() rejects the value.  This handler does not
 * consult vxlan_can_change_config().
 */
static int
vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	int error;

	cmd = arg;

	VXLAN_WLOCK(sc);
	if (vxlan_check_ftable_timeout(cmd->vxlcmd_ftable_timeout) == 0) {
		sc->vxl_ftable_timeout = cmd->vxlcmd_ftable_timeout;
		error = 0;
	} else
		error = EINVAL;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set the maximum forwarding table size.  Returns EINVAL
 * if vxlan_check_ftable_max() rejects the value.
 */
static int
vxlan_ctrl_set_ftable_max(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	int error;

	cmd = arg;

	VXLAN_WLOCK(sc);
	if (vxlan_check_ftable_max(cmd->vxlcmd_ftable_max) == 0) {
		sc->vxl_ftable_max = cmd->vxlcmd_ftable_max;
		error = 0;
	} else
		error = EINVAL;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: record the multicast interface name (truncated to
 * IFNAMSIZ by strlcpy) and recompute the hardware capabilities.  Returns
 * EBUSY if the configuration cannot be changed now.
 */
static int
vxlan_ctrl_set_multicast_if(struct vxlan_softc * sc, void *arg)
{
	struct ifvxlancmd *cmd;
	int error;

	cmd = arg;

	VXLAN_WLOCK(sc);
	if (vxlan_can_change_config(sc)) {
		strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ);
		vxlan_set_hwcaps(sc);
		error = 0;
	} else
		error = EBUSY;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set the TTL.  Returns EINVAL if vxlan_check_ttl()
 * rejects the value.  The new value is also pushed into the IPv4 and IPv6
 * multicast option structures when those exist.
 */
static int
vxlan_ctrl_set_ttl(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	int error;

	cmd = arg;

	VXLAN_WLOCK(sc);
	if (vxlan_check_ttl(cmd->vxlcmd_ttl) == 0) {
		sc->vxl_ttl = cmd->vxlcmd_ttl;
		if (sc->vxl_im4o != NULL)
			sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl;
		if (sc->vxl_im6o != NULL)
			sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl;
		error = 0;
	} else
		error = EINVAL;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: set or clear VXLAN_FLAG_LEARN according to
 * VXLAN_CMD_FLAG_LEARN in the command flags.  Always returns 0.
 */
static int
vxlan_ctrl_set_learn(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;

	cmd = arg;

	VXLAN_WLOCK(sc);
	if (cmd->vxlcmd_flags & VXLAN_CMD_FLAG_LEARN)
		sc->vxl_flags |= VXLAN_FLAG_LEARN;
	else
		sc->vxl_flags &= ~VXLAN_FLAG_LEARN;
	VXLAN_WUNLOCK(sc);

	return (0);
}

/*
 * Control handler: add a static forwarding table entry mapping the MAC in
 * the command to the given remote address.  The address is copied to a
 * local before validation and must be IPv4/IPv6, not the "any" address and
 * not multicast (EINVAL otherwise), and of the same family as the
 * configured destination (EAFNOSUPPORT otherwise).
 */
static int
vxlan_ctrl_ftable_entry_add(struct vxlan_softc *sc, void *arg)
{
	union vxlan_sockaddr vxlsa;
	struct ifvxlancmd *cmd;
	struct vxlan_ftable_entry *fe;
	int error;

	cmd = arg;
	vxlsa = cmd->vxlcmd_sa;

	if (!VXLAN_SOCKADDR_IS_IPV46(&vxlsa))
		return (EINVAL);
	if (vxlan_sockaddr_in_any(&vxlsa) != 0)
		return (EINVAL);
	if
(vxlan_sockaddr_in_multicast(&vxlsa) != 0)
		return (EINVAL);
	/* BMV: We could support both IPv4 and IPv6 later. */
	if (vxlsa.sa.sa_family != sc->vxl_dst_addr.sa.sa_family)
		return (EAFNOSUPPORT);

	if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) {
		error = vxlan_sockaddr_in6_embedscope(&vxlsa);
		if (error)
			return (error);
	}

	fe = vxlan_ftable_entry_alloc();
	if (fe == NULL)
		return (ENOMEM);

	/* An unspecified port falls back to the configured remote port. */
	if (vxlsa.in4.sin_port == 0)
		vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port;

	vxlan_ftable_entry_init(sc, fe, cmd->vxlcmd_mac, &vxlsa.sa,
	    VXLAN_FE_FLAG_STATIC);

	VXLAN_WLOCK(sc);
	error = vxlan_ftable_entry_insert(sc, fe);
	VXLAN_WUNLOCK(sc);

	/* The entry is still ours if the insert failed; free it. */
	if (error)
		vxlan_ftable_entry_free(fe);

	return (error);
}

/*
 * Control handler: remove the forwarding table entry for the MAC in the
 * command.  Returns ENOENT when no such entry is found.
 */
static int
vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	struct vxlan_ftable_entry *fe;
	int error;

	cmd = arg;

	VXLAN_WLOCK(sc);
	fe = vxlan_ftable_entry_lookup(sc, cmd->vxlcmd_mac);
	if (fe != NULL) {
		vxlan_ftable_entry_destroy(sc, fe);
		error = 0;
	} else
		error = ENOENT;
	VXLAN_WUNLOCK(sc);

	return (error);
}

/*
 * Control handler: flush the forwarding table.  The VXLAN_CMD_FLAG_FLUSH_ALL
 * bit of the command flags is passed through as the "all" argument of
 * vxlan_ftable_flush().  Always returns 0.
 */
static int
vxlan_ctrl_flush(struct vxlan_softc *sc, void *arg)
{
	struct ifvxlancmd *cmd;
	int all;

	cmd = arg;
	all = cmd->vxlcmd_flags & VXLAN_CMD_FLAG_FLUSH_ALL;

	VXLAN_WLOCK(sc);
	vxlan_ftable_flush(sc, all);
	VXLAN_WUNLOCK(sc);

	return (0);
}

/*
 * Dispatch a driver-specific ioctl through vxlan_control_table.
 *
 * get is non-zero for the "get" flavour of the request.  EINVAL is returned
 * for a command index outside the table, for a get/set direction that does
 * not agree with the entry's COPYOUT flag, and for an argument length that
 * differs from the entry's vxlc_argsize or exceeds the local args buffer.
 * Entries flagged SUSER additionally require PRIV_NET_VXLAN.
 */
static int
vxlan_ioctl_drvspec(struct vxlan_softc *sc, struct ifdrv *ifd, int get)
{
	const struct vxlan_control *vc;
	union {
		struct ifvxlancfg	cfg;
		struct ifvxlancmd	cmd;
	} args;
	int out, error;

	if (ifd->ifd_cmd >= vxlan_control_table_size)
		return (EINVAL);

	/* Zeroed up front so handlers never see stale stack contents. */
	bzero(&args, sizeof(args));
	vc = &vxlan_control_table[ifd->ifd_cmd];
	out = (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) != 0;

	if ((get != 0 && out == 0) || (get == 0 && out != 0))
		return (EINVAL);

	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_SUSER) {
		error = priv_check(curthread, PRIV_NET_VXLAN);
		if (error)
			return (error);
	}

	if (ifd->ifd_len != vc->vxlc_argsize ||
	    ifd->ifd_len > sizeof(args))
		return (EINVAL);

	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYIN) {
		error =
copyin(ifd->ifd_data, &args, ifd->ifd_len);
		if (error)
			return (error);
	}

	error = vc->vxlc_func(sc, &args);
	if (error)
		return (error);

	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) {
		error = copyout(&args, ifd->ifd_data, ifd->ifd_len);
		if (error)
			return (error);
	}

	return (0);
}

/*
 * Interface ioctl handler.  data is interpreted as a struct ifreq or a
 * struct ifdrv depending on the command; commands not listed in the switch
 * are passed on to ether_ioctl().
 */
static int
vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct rm_priotracker tracker;
	struct vxlan_softc *sc;
	struct ifreq *ifr;
	struct ifdrv *ifd;
	int error;

	sc = ifp->if_softc;
	ifr = (struct ifreq *) data;
	ifd = (struct ifdrv *) data;

	error = 0;

	switch (cmd) {
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		/* Accepted, but nothing to do. */
		break;

	case SIOCGDRVSPEC:
	case SIOCSDRVSPEC:
		error = vxlan_ioctl_drvspec(sc, ifd, cmd == SIOCGDRVSPEC);
		break;

	case SIOCSIFFLAGS:
		error = vxlan_ioctl_ifflags(sc);
		break;

	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->vxl_media, cmd);
		break;

	case SIOCSIFMTU:
		if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > VXLAN_MAX_MTU) {
			error = EINVAL;
		} else {
			/* Remember that the MTU was set explicitly. */
			VXLAN_WLOCK(sc);
			ifp->if_mtu = ifr->ifr_mtu;
			sc->vxl_flags |= VXLAN_FLAG_USER_MTU;
			VXLAN_WUNLOCK(sc);
		}
		break;

	case SIOCSIFCAP:
		VXLAN_WLOCK(sc);
		error = vxlan_set_reqcap(sc, ifp, ifr->ifr_reqcap);
		if (error == 0)
			vxlan_set_hwcaps(sc);
		VXLAN_WUNLOCK(sc);
		break;

	case SIOCGTUNFIB:
		VXLAN_RLOCK(sc, &tracker);
		ifr->ifr_fib = sc->vxl_fibnum;
		VXLAN_RUNLOCK(sc, &tracker);
		break;

	case SIOCSTUNFIB:
		if ((error = priv_check(curthread, PRIV_NET_VXLAN)) != 0)
			break;

		if (ifr->ifr_fib >= rt_numfibs)
			error = EINVAL;
		else {
			VXLAN_WLOCK(sc);
			sc->vxl_fibnum = ifr->ifr_fib;
			VXLAN_WUNLOCK(sc);
		}
		break;

	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}

	return (error);
}

#if defined(INET) || defined(INET6)
/*
 * Choose the outer UDP source port for a frame, in the range
 * [vxl_min_port, vxl_max_port].  The hash is the mbuf's flowid when the
 * mbuf carries a hash type; otherwise it is a Jenkins hash over the first
 * ETHER_HDR_LEN bytes at m_data, keyed with vxl_port_hash_key.
 */
static uint16_t
vxlan_pick_source_port(struct vxlan_softc *sc, struct mbuf *m)
{
	int range;
	uint32_t hash;

	range = sc->vxl_max_port - sc->vxl_min_port + 1;

	if (M_HASHTYPE_ISHASH(m))
		hash = m->m_pkthdr.flowid;
	else
		hash = jenkins_hash(m->m_data, ETHER_HDR_LEN,
		    sc->vxl_port_hash_key);

	return (sc->vxl_min_port + (hash % range));
}
/*
 * Fill in the UDP and VXLAN headers that sit ipoff bytes into the mbuf.
 *
 * srcport and dstport are stored as passed in (no byte swapping is done
 * here).  The UDP length covers everything after the IP header, and the UDP
 * checksum field is set to 0.  The VNI is shifted by VXLAN_HDR_VNI_SHIFT and
 * stored in network byte order.
 */
static void
vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff,
    uint16_t srcport, uint16_t dstport)
{
	struct vxlanudphdr *hdr;
	struct udphdr *udph;
	struct vxlan_header *vxh;
	int len;

	len = m->m_pkthdr.len - ipoff;
	MPASS(len >= sizeof(struct vxlanudphdr));
	hdr = mtodo(m, ipoff);

	udph = &hdr->vxlh_udp;
	udph->uh_sport = srcport;
	udph->uh_dport = dstport;
	udph->uh_ulen = htons(len);
	udph->uh_sum = 0;

	vxh = &hdr->vxlh_hdr;
	vxh->vxlh_flags = htonl(VXLAN_HDR_FLAGS_VALID_VNI);
	vxh->vxlh_vni = htonl(sc->vxl_vni << VXLAN_HDR_VNI_SHIFT);
}
#endif

#if defined(INET6) || defined(INET)
/*
 * Return the CSUM_INNER_* equivalent of CSUM_* caps.
 *
 * The result starts out as encap and has the CSUM_INNER_* counterpart of
 * each recognised bit of csum_flags_in OR-ed into it.
 */
static uint32_t
csum_flags_to_inner_flags(uint32_t csum_flags_in, const uint32_t encap)
{
	uint32_t csum_flags = encap;
	const uint32_t v4 = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP;

	/*
	 * csum_flags can request either v4 or v6 offload but not both.
	 * tcp_output always sets CSUM_TSO (both CSUM_IP_TSO and CSUM_IP6_TSO)
	 * so those bits are no good to detect the IP version.  Other bits are
	 * always set with CSUM_TSO and we use those to figure out the IP
	 * version.
*/
	if (csum_flags_in & v4) {
		if (csum_flags_in & CSUM_IP)
			csum_flags |= CSUM_INNER_IP;
		if (csum_flags_in & CSUM_IP_UDP)
			csum_flags |= CSUM_INNER_IP_UDP;
		if (csum_flags_in & CSUM_IP_TCP)
			csum_flags |= CSUM_INNER_IP_TCP;
		if (csum_flags_in & CSUM_IP_TSO)
			csum_flags |= CSUM_INNER_IP_TSO;
	} else {
#ifdef INVARIANTS
		/* Not v4, so at least one v6 checksum bit must be present. */
		const uint32_t v6 = CSUM_IP6_UDP | CSUM_IP6_TCP;

		MPASS((csum_flags_in & v6) != 0);
#endif
		if (csum_flags_in & CSUM_IP6_UDP)
			csum_flags |= CSUM_INNER_IP6_UDP;
		if (csum_flags_in & CSUM_IP6_TCP)
			csum_flags |= CSUM_INNER_IP6_TCP;
		if (csum_flags_in & CSUM_IP6_TSO)
			csum_flags |= CSUM_INNER_IP6_TSO;
	}

	return (csum_flags);
}
#endif

/*
 * Encapsulate the frame in m in IPv4/UDP/VXLAN and send it to the address
 * in fvxlsa with ip_output().  Must be called inside the network epoch
 * (asserted).  The mbuf is consumed on every path.  Without INET the mbuf
 * is freed and ENOTSUP is returned.
 */
static int
vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
    struct mbuf *m)
{
#ifdef INET
	struct ifnet *ifp;
	struct ip *ip;
	struct in_addr srcaddr, dstaddr;
	uint16_t srcport, dstport;
	int len, mcast, error;
	struct route route, *ro;
	struct sockaddr_in *sin;
	uint32_t csum_flags;

	NET_EPOCH_ASSERT();

	ifp = sc->vxl_ifp;
	srcaddr = sc->vxl_src_addr.in4.sin_addr;
	srcport = vxlan_pick_source_port(sc, m);
	dstaddr = fvxlsa->in4.sin_addr;
	dstport = fvxlsa->in4.sin_port;

	/* Make room for the outer IP header and the UDP+VXLAN header. */
	M_PREPEND(m, sizeof(struct ip) + sizeof(struct vxlanudphdr),
	    M_NOWAIT);
	if (m == NULL) {
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return (ENOBUFS);
	}

	len = m->m_pkthdr.len;

	ip = mtod(m, struct ip *);
	ip->ip_tos = 0;
	ip->ip_len = htons(len);
	ip->ip_off = 0;
	ip->ip_ttl = sc->vxl_ttl;
	ip->ip_p = IPPROTO_UDP;
	ip->ip_sum = 0;
	ip->ip_src = srcaddr;
	ip->ip_dst = dstaddr;

	vxlan_encap_header(sc, m, sizeof(struct ip), srcport, dstport);

	/* Remember whether the inner frame was multicast/broadcast. */
	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
	m->m_flags &= ~(M_MCAST | M_BCAST);

	m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
	if (m->m_pkthdr.csum_flags != 0) {
		/*
		 * HW checksum (L3 and/or L4) or TSO has been requested.  Look
		 * up the ifnet for the outbound route and verify that the
		 * outbound ifnet can perform the requested operation on the
		 * inner frame.
*/
		bzero(&route, sizeof(route));
		ro = &route;
		sin = (struct sockaddr_in *)&ro->ro_dst;
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = ip->ip_dst;
		ro->ro_nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE,
		    0);
		if (ro->ro_nh == NULL) {
			m_freem(m);
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return (EHOSTUNREACH);
		}

		csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
		    CSUM_ENCAP_VXLAN);
		/* Drop the frame if the egress ifnet lacks any needed bit. */
		if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
		    csum_flags) {
			/* The diagnostic is rate limited via ppsratecheck(). */
			if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
				const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;

				if_printf(ifp, "interface %s is missing hwcaps "
				    "0x%08x, csum_flags 0x%08x -> 0x%08x, "
				    "hwassist 0x%08x\n", nh_ifp->if_xname,
				    csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
				    m->m_pkthdr.csum_flags, csum_flags,
				    (uint32_t)nh_ifp->if_hwassist);
			}
			m_freem(m);
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return (ENXIO);
		}
		m->m_pkthdr.csum_flags = csum_flags;
		if (csum_flags &
		    (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
		    CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
			counter_u64_add(sc->vxl_stats.txcsum, 1);
			if (csum_flags & CSUM_INNER_TSO)
				counter_u64_add(sc->vxl_stats.tso, 1);
		}
	} else
		ro = NULL;
	error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL);
	if (error == 0) {
		/* len was sampled after M_PREPEND, so it includes the outer headers. */
		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
		if (mcast != 0)
			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
	} else
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);

	return (error);
#else
	m_freem(m);
	return (ENOTSUP);
#endif
}

/*
 * Encapsulate the frame in m in IPv6/UDP/VXLAN and send it to the address
 * in fvxlsa with ip6_output().  Must be called inside the network epoch
 * (asserted).  The mbuf is consumed on every path.  Without INET6 the mbuf
 * is freed and ENOTSUP is returned.
 */
static int
vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
    struct mbuf *m)
{
#ifdef INET6
	struct ifnet *ifp;
	struct ip6_hdr *ip6;
	const struct in6_addr *srcaddr, *dstaddr;
	uint16_t srcport, dstport;
	int len, mcast, error;
	struct route_in6 route, *ro;
	struct sockaddr_in6 *sin6;
	uint32_t csum_flags;

	NET_EPOCH_ASSERT();

	ifp = sc->vxl_ifp;
	srcaddr = &sc->vxl_src_addr.in6.sin6_addr;
	srcport = vxlan_pick_source_port(sc, m);
	dstaddr =
&fvxlsa->in6.sin6_addr;
	dstport = fvxlsa->in6.sin6_port;

	/* Make room for the outer IPv6 header and the UDP+VXLAN header. */
	M_PREPEND(m, sizeof(struct ip6_hdr) + sizeof(struct vxlanudphdr),
	    M_NOWAIT);
	if (m == NULL) {
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return (ENOBUFS);
	}

	len = m->m_pkthdr.len;

	ip6 = mtod(m, struct ip6_hdr *);
	ip6->ip6_flow = 0;		/* BMV: Keep in forwarding entry? */
	ip6->ip6_vfc = IPV6_VERSION;
	/* The payload length is left as 0 here. */
	ip6->ip6_plen = 0;
	ip6->ip6_nxt = IPPROTO_UDP;
	ip6->ip6_hlim = sc->vxl_ttl;
	ip6->ip6_src = *srcaddr;
	ip6->ip6_dst = *dstaddr;

	vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport);

	/* Remember whether the inner frame was multicast/broadcast. */
	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
	m->m_flags &= ~(M_MCAST | M_BCAST);

	ro = NULL;
	m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
	if (m->m_pkthdr.csum_flags != 0) {
		/*
		 * HW checksum (L3 and/or L4) or TSO has been requested.  Look
		 * up the ifnet for the outbound route and verify that the
		 * outbound ifnet can perform the requested operation on the
		 * inner frame.
		 */
		bzero(&route, sizeof(route));
		ro = &route;
		sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_len = sizeof(*sin6);
		sin6->sin6_addr = ip6->ip6_dst;
		ro->ro_nh = fib6_lookup(M_GETFIB(m), &ip6->ip6_dst, 0,
		    NHR_NONE, 0);
		if (ro->ro_nh == NULL) {
			m_freem(m);
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return (EHOSTUNREACH);
		}

		csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
		    CSUM_ENCAP_VXLAN);
		/* Drop the frame if the egress ifnet lacks any needed bit. */
		if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
		    csum_flags) {
			/* The diagnostic is rate limited via ppsratecheck(). */
			if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
				const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;

				if_printf(ifp, "interface %s is missing hwcaps "
				    "0x%08x, csum_flags 0x%08x -> 0x%08x, "
				    "hwassist 0x%08x\n", nh_ifp->if_xname,
				    csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
				    m->m_pkthdr.csum_flags, csum_flags,
				    (uint32_t)nh_ifp->if_hwassist);
			}
			m_freem(m);
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			return (ENXIO);
		}
		m->m_pkthdr.csum_flags = csum_flags;
		if (csum_flags &
		    (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
		    CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
counter_u64_add(sc->vxl_stats.txcsum, 1);
			if (csum_flags & CSUM_INNER_TSO)
				counter_u64_add(sc->vxl_stats.tso, 1);
		}
	} else if (ntohs(dstport) != V_zero_checksum_port) {
		/*
		 * No inner offload requested and the destination port is not
		 * the zero-checksum port: seed uh_sum with the pseudo-header
		 * checksum and request completion of the outer UDP checksum
		 * via CSUM_UDP_IPV6.
		 */
		struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr));

		hdr->uh_sum = in6_cksum_pseudo(ip6,
		    m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0);
		m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
	}
	error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL);
	if (error == 0) {
		/* len was sampled after M_PREPEND, so it includes the outer headers. */
		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
		if (mcast != 0)
			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
	} else
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);

	return (error);
#else
	m_freem(m);
	return (ENOTSUP);
#endif
}

/*
 * Interface transmit routine.
 *
 * Taps BPF, then under the softc read lock sets the mbuf FIB and selects a
 * forwarding entry: a lookup by destination MAC for frames that are neither
 * broadcast nor multicast, and the default entry otherwise or on a miss.
 * The remote address is copied out, a reference on sc is taken with
 * VXLAN_ACQUIRE() before the read lock is dropped, and the frame is then
 * encapsulated over IPv4 or IPv6.  Returns ENETDOWN (freeing the mbuf) when
 * the interface is not running.
 */
static int
vxlan_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct rm_priotracker tracker;
	union vxlan_sockaddr vxlsa;
	struct vxlan_softc *sc;
	struct vxlan_ftable_entry *fe;
	struct ifnet *mcifp;
	struct ether_header *eh;
	int ipv4, error;

	sc = ifp->if_softc;
	eh = mtod(m, struct ether_header *);
	fe = NULL;
	mcifp = NULL;

	ETHER_BPF_MTAP(ifp, m);

	VXLAN_RLOCK(sc, &tracker);
	M_SETFIB(m, sc->vxl_fibnum);
	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
		VXLAN_RUNLOCK(sc, &tracker);
		m_freem(m);
		return (ENETDOWN);
	}

	if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
		fe = vxlan_ftable_entry_lookup(sc, eh->ether_dhost);
	if (fe == NULL)
		fe = &sc->vxl_default_fe;
	/* Copy the address so it remains usable after the lock is dropped. */
	vxlan_sockaddr_copy(&vxlsa, &fe->vxlfe_raddr.sa);

	ipv4 = VXLAN_SOCKADDR_IS_IPV4(&vxlsa) != 0;
	if (vxlan_sockaddr_in_multicast(&vxlsa) != 0)
		mcifp = vxlan_multicast_if_ref(sc, ipv4);

	VXLAN_ACQUIRE(sc);
	VXLAN_RUNLOCK(sc, &tracker);

	if (ipv4 != 0)
		error = vxlan_encap4(sc, &vxlsa, m);
	else
		error = vxlan_encap6(sc, &vxlsa, m);

	vxlan_release(sc);
	if (mcifp != NULL)
		if_rele(mcifp);

	return (error);
}

/* if_qflush method; intentionally empty. */
static void
vxlan_qflush(struct ifnet *ifp __unused)
{
}

/*
 * Handle a UDP packet received on a VXLAN socket.  offset is advanced past
 * the UDP header to reach the VXLAN header; xvso is the struct vxlan_socket.
 * Always returns true, and frees the mbuf unless vxlan_input() consumed it.
 */
static bool
vxlan_rcv_udp_packet(struct mbuf *m, int offset, struct inpcb *inpcb,
    const struct sockaddr *srcsa, void *xvso)
{
	struct
vxlan_socket *vso;
	struct vxlan_header *vxh, vxlanhdr;
	uint32_t vni;
	int error __unused;

	M_ASSERTPKTHDR(m);
	vso = xvso;
	offset += sizeof(struct udphdr);

	if (m->m_pkthdr.len < offset + sizeof(struct vxlan_header))
		goto out;

	/* Copy the header out if it is not contiguous in the first mbuf. */
	if (__predict_false(m->m_len < offset + sizeof(struct vxlan_header))) {
		m_copydata(m, offset, sizeof(struct vxlan_header),
		    (caddr_t) &vxlanhdr);
		vxh = &vxlanhdr;
	} else
		vxh = mtodo(m, offset);

	/*
	 * Drop if there is a reserved bit set in either the flags or VNI
	 * fields of the header.  This goes against the specification, but
	 * a bit set may indicate an unsupported new feature.  This matches
	 * the behavior of the Linux implementation.
	 */
	/*
	 * NOTE(review): vxlh_vni is tested against ~VXLAN_VNI_MASK before
	 * ntohl() is applied (the ntohl() happens two statements below) --
	 * confirm the mask is meant for the network-byte-order value on all
	 * supported endiannesses.
	 */
	if (vxh->vxlh_flags != htonl(VXLAN_HDR_FLAGS_VALID_VNI) ||
	    vxh->vxlh_vni & ~VXLAN_VNI_MASK)
		goto out;

	vni = ntohl(vxh->vxlh_vni) >> VXLAN_HDR_VNI_SHIFT;

	/* Adjust to the start of the inner Ethernet frame. */
	m_adj_decap(m, offset + sizeof(struct vxlan_header));

	error = vxlan_input(vso, vni, &m, srcsa);
	/* On success vxlan_input() must have taken the mbuf (m == NULL). */
	MPASS(error != 0 || m == NULL);

out:
	if (m != NULL)
		m_freem(m);

	return (true);
}

/*
 * Deliver a decapsulated Ethernet frame to the vxlan interface registered
 * on vso for vni.
 *
 * On success the mbuf is passed to the interface's if_input routine, *m0 is
 * set to NULL and 0 is returned.  On any error the mbuf is left in *m0 for
 * the caller to dispose of.  Errors: EINVAL (frame shorter than an Ethernet
 * header), ENOENT (no softc for the VNI), ENETDOWN (interface not running),
 * EDEADLK (frame was received on this very interface).
 */
static int
vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0,
    const struct sockaddr *sa)
{
	struct vxlan_softc *sc;
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	int error;

	m = *m0;

	if (m->m_pkthdr.len < ETHER_HDR_LEN)
		return (EINVAL);

	/* Paired with the vxlan_release() at the out label below. */
	sc = vxlan_socket_lookup_softc(vso, vni);
	if (sc == NULL)
		return (ENOENT);

	ifp = sc->vxl_ifp;
	eh = mtod(m, struct ether_header *);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
		error = ENETDOWN;
		goto out;
	} else if (ifp == m->m_pkthdr.rcvif) {
		/* XXX Does not catch more complex loops.
*/
		error = EDEADLK;
		goto out;
	}

	if (sc->vxl_flags & VXLAN_FLAG_LEARN)
		vxlan_ftable_learn(sc, sa, eh->ether_shost);

	m_clrprotoflags(m);
	m->m_pkthdr.rcvif = ifp;
	M_SETFIB(m, ifp->if_fib);
	/*
	 * Translate the inner checksum results to plain ones when either
	 * IFCAP_RXCSUM is enabled and CSUM_INNER_L3_CALC is set, or
	 * IFCAP_RXCSUM_IPV6 is enabled and CSUM_INNER_L3_CALC is not set.
	 * Otherwise all checksum state on the mbuf is cleared.
	 */
	if (((ifp->if_capenable & IFCAP_RXCSUM &&
	    m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) ||
	    (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
	    !(m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)))) {
		uint32_t csum_flags = 0;

		if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)
			csum_flags |= CSUM_L3_CALC;
		if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID)
			csum_flags |= CSUM_L3_VALID;
		if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC)
			csum_flags |= CSUM_L4_CALC;
		if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID)
			csum_flags |= CSUM_L4_VALID;
		m->m_pkthdr.csum_flags = csum_flags;
		counter_u64_add(sc->vxl_stats.rxcsum, 1);
	} else {
		/* clear everything */
		m->m_pkthdr.csum_flags = 0;
		m->m_pkthdr.csum_data = 0;
	}

	(*ifp->if_input)(ifp, m);
	/* The mbuf now belongs to the input path; tell the caller. */
	*m0 = NULL;
	error = 0;

out:
	vxlan_release(sc);
	return (error);
}

/*
 * Allocate the txcsum, tso and rxcsum counters.  Returns 0 on success; if
 * any allocation yields NULL, everything allocated so far is released via
 * vxlan_stats_free() and ENOMEM is returned.
 */
static int
vxlan_stats_alloc(struct vxlan_softc *sc)
{
	struct vxlan_statistics *stats = &sc->vxl_stats;

	stats->txcsum = counter_u64_alloc(M_WAITOK);
	if (stats->txcsum == NULL)
		goto failed;

	stats->tso = counter_u64_alloc(M_WAITOK);
	if (stats->tso == NULL)
		goto failed;

	stats->rxcsum = counter_u64_alloc(M_WAITOK);
	if (stats->rxcsum == NULL)
		goto failed;

	return (0);
failed:
	vxlan_stats_free(sc);
	return (ENOMEM);
}

/*
 * Free whichever statistics counters are allocated and reset the pointers
 * to NULL, so the function tolerates partially allocated state and repeated
 * calls.
 */
static void
vxlan_stats_free(struct vxlan_softc *sc)
{
	struct vxlan_statistics *stats = &sc->vxl_stats;

	if (stats->txcsum != NULL) {
		counter_u64_free(stats->txcsum);
		stats->txcsum = NULL;
	}
	if (stats->tso != NULL) {
		counter_u64_free(stats->tso);
		stats->tso = NULL;
	}
	if (stats->rxcsum != NULL) {
		counter_u64_free(stats->rxcsum);
		stats->rxcsum = NULL;
	}
}

/*
 * Load the built-in defaults into a softc: learning enabled, VNI set to
 * VXLAN_VNI_MAX, TTL IPDEFTTL, source and destination port VXLAN_PORT (or
 * VXLAN_LEGACY_PORT when the "legacy_port" tunable evaluates non-zero), the
 * source port range taken from V_ipport_firstauto/V_ipport_lastauto, and
 * the default forwarding table limits.
 */
static void
vxlan_set_default_config(struct vxlan_softc *sc)
{

	sc->vxl_flags |= VXLAN_FLAG_LEARN;

	sc->vxl_vni = VXLAN_VNI_MAX;
	sc->vxl_ttl = IPDEFTTL;

	if (!vxlan_tunable_int(sc, "legacy_port", vxlan_legacy_port)) {
		sc->vxl_src_addr.in4.sin_port =
htons(VXLAN_PORT);
		sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_PORT);
	} else {
		sc->vxl_src_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT);
		sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT);
	}

	sc->vxl_min_port = V_ipport_firstauto;
	sc->vxl_max_port = V_ipport_lastauto;

	sc->vxl_ftable_max = VXLAN_FTABLE_MAX;
	sc->vxl_ftable_timeout = VXLAN_FTABLE_TIMEOUT;
}

/*
 * Apply the creation-time parameters in vxlp to sc.  Only fields whose
 * VXLAN_PARAM_WITH_* bit is set in vxlp_with are considered.
 *
 * Returns EAFNOSUPPORT when an address of a family that is not compiled in
 * is supplied, or the error from embedding an IPv6 scope.  Otherwise
 * returns 0.  A VNI, port range, forwarding table limit or TTL that fails
 * its check is skipped silently and the previous value is kept.
 */
static int
vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp)
{

#ifndef INET
	if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR4 |
	    VXLAN_PARAM_WITH_REMOTE_ADDR4))
		return (EAFNOSUPPORT);
#endif

#ifndef INET6
	if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR6 |
	    VXLAN_PARAM_WITH_REMOTE_ADDR6))
		return (EAFNOSUPPORT);
#else
	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) {
		int error = vxlan_sockaddr_in6_embedscope(&vxlp->vxlp_local_sa);
		if (error)
			return (error);
	}
	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) {
		int error = vxlan_sockaddr_in6_embedscope(
		   &vxlp->vxlp_remote_sa);
		if (error)
			return (error);
	}
#endif

	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_VNI) {
		if (vxlan_check_vni(vxlp->vxlp_vni) == 0)
			sc->vxl_vni = vxlp->vxlp_vni;
	}

	/* Local address: IPv4 takes precedence if both bits are set. */
	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR4) {
		sc->vxl_src_addr.in4.sin_len = sizeof(struct sockaddr_in);
		sc->vxl_src_addr.in4.sin_family = AF_INET;
		sc->vxl_src_addr.in4.sin_addr =
		    vxlp->vxlp_local_sa.in4.sin_addr;
	} else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) {
		sc->vxl_src_addr.in6.sin6_len = sizeof(struct sockaddr_in6);
		sc->vxl_src_addr.in6.sin6_family = AF_INET6;
		sc->vxl_src_addr.in6.sin6_addr =
		    vxlp->vxlp_local_sa.in6.sin6_addr;
	}

	/* Remote address: same precedence as above. */
	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR4) {
		sc->vxl_dst_addr.in4.sin_len = sizeof(struct sockaddr_in);
		sc->vxl_dst_addr.in4.sin_family = AF_INET;
		sc->vxl_dst_addr.in4.sin_addr =
		    vxlp->vxlp_remote_sa.in4.sin_addr;
	} else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) {
		sc->vxl_dst_addr.in6.sin6_len = sizeof(struct sockaddr_in6);
		sc->vxl_dst_addr.in6.sin6_family = AF_INET6;
sc->vxl_dst_addr.in6.sin6_addr =
		    vxlp->vxlp_remote_sa.in6.sin6_addr;
	}

	/* Ports are stored in network byte order through the in4 member. */
	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_PORT)
		sc->vxl_src_addr.in4.sin_port = htons(vxlp->vxlp_local_port);
	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_PORT)
		sc->vxl_dst_addr.in4.sin_port = htons(vxlp->vxlp_remote_port);

	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_PORT_RANGE) {
		if (vxlp->vxlp_min_port <= vxlp->vxlp_max_port) {
			sc->vxl_min_port = vxlp->vxlp_min_port;
			sc->vxl_max_port = vxlp->vxlp_max_port;
		}
	}

	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_MULTICAST_IF)
		strlcpy(sc->vxl_mc_ifname, vxlp->vxlp_mc_ifname, IFNAMSIZ);

	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_TIMEOUT) {
		if (vxlan_check_ftable_timeout(vxlp->vxlp_ftable_timeout) == 0)
			sc->vxl_ftable_timeout = vxlp->vxlp_ftable_timeout;
	}

	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_MAX) {
		if (vxlan_check_ftable_max(vxlp->vxlp_ftable_max) == 0)
			sc->vxl_ftable_max = vxlp->vxlp_ftable_max;
	}

	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_TTL) {
		if (vxlan_check_ttl(vxlp->vxlp_ttl) == 0)
			sc->vxl_ttl = vxlp->vxlp_ttl;
	}

	/* Learning can only be switched off here; this never sets the flag. */
	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LEARN) {
		if (vxlp->vxlp_learn == 0)
			sc->vxl_flags &= ~VXLAN_FLAG_LEARN;
	}

	return (0);
}

/*
 * Validate a requested capability set and store it in sc->vxl_reqcap.
 *
 * mask holds the bits that differ from the currently enabled capabilities.
 * Turning a tx checksum capability off also turns the matching TSO
 * capability off (with a console message).  Turning TSO on while the
 * matching tx checksum capability is off is refused with EAGAIN.
 */
static int
vxlan_set_reqcap(struct vxlan_softc *sc, struct ifnet *ifp, int reqcap)
{
	int mask = reqcap ^ ifp->if_capenable;

	/* Disable TSO if tx checksums are disabled. */
	if (mask & IFCAP_TXCSUM && !(reqcap & IFCAP_TXCSUM) &&
	    reqcap & IFCAP_TSO4) {
		reqcap &= ~IFCAP_TSO4;
		if_printf(ifp, "tso4 disabled due to -txcsum.\n");
	}
	if (mask & IFCAP_TXCSUM_IPV6 && !(reqcap & IFCAP_TXCSUM_IPV6) &&
	    reqcap & IFCAP_TSO6) {
		reqcap &= ~IFCAP_TSO6;
		if_printf(ifp, "tso6 disabled due to -txcsum6.\n");
	}

	/* Do not enable TSO if tx checksums are disabled.
*/
	if (mask & IFCAP_TSO4 && reqcap & IFCAP_TSO4 &&
	    !(reqcap & IFCAP_TXCSUM)) {
		if_printf(ifp, "enable txcsum first.\n");
		return (EAGAIN);
	}
	if (mask & IFCAP_TSO6 && reqcap & IFCAP_TSO6 &&
	    !(reqcap & IFCAP_TXCSUM_IPV6)) {
		if_printf(ifp, "enable txcsum6 first.\n");
		return (EAGAIN);
	}

	sc->vxl_reqcap = reqcap;
	return (0);
}

/*
 * A VXLAN interface inherits the capabilities of the vxlandev or the interface
 * hosting the vxlanlocal address.
 *
 * The interface's capabilities are first reset to VXLAN_BASIC_IFCAPS with no
 * hwassist.  The parent interface p is then found either by name
 * (vxl_mc_ifname, through ifunit_ref(), in which case the reference is
 * dropped again before returning) or, when the source address is not the
 * "any" address, by looking that address up with ifa_ifwithaddr() with the
 * port zeroed.  If no parent is found the basic capabilities are left in
 * place.
 */
static void
vxlan_set_hwcaps(struct vxlan_softc *sc)
{
	struct epoch_tracker et;
	struct ifnet *p;
	struct ifaddr *ifa;
	u_long hwa;
	int cap, ena;
	bool rel;
	struct ifnet *ifp = sc->vxl_ifp;

	/* reset caps */
	ifp->if_capabilities &= VXLAN_BASIC_IFCAPS;
	ifp->if_capenable &= VXLAN_BASIC_IFCAPS;
	ifp->if_hwassist = 0;

	NET_EPOCH_ENTER(et);
	CURVNET_SET(ifp->if_vnet);

	rel = false;
	p = NULL;
	if (sc->vxl_mc_ifname[0] != '\0') {
		rel = true;
		p = ifunit_ref(sc->vxl_mc_ifname);
	} else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
		if (sc->vxl_src_addr.sa.sa_family == AF_INET) {
			struct sockaddr_in in4 = sc->vxl_src_addr.in4;

			in4.sin_port = 0;
			ifa = ifa_ifwithaddr((struct sockaddr *)&in4);
			if (ifa != NULL)
				p = ifa->ifa_ifp;
		} else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) {
			struct sockaddr_in6 in6 = sc->vxl_src_addr.in6;

			in6.sin6_port = 0;
			ifa = ifa_ifwithaddr((struct sockaddr *)&in6);
			if (ifa != NULL)
				p = ifa->ifa_ifp;
		}
	}
	if (p == NULL)
		goto done;

	cap = ena = hwa = 0;

	/* checksum offload */
	if (p->if_capabilities & IFCAP_VXLAN_HWCSUM)
		cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
	if (p->if_capenable & IFCAP_VXLAN_HWCSUM) {
		/* Enabled = requested here AND enabled on the parent. */
		ena |= sc->vxl_reqcap & p->if_capenable &
		    (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
		if (ena & IFCAP_TXCSUM) {
			if (p->if_hwassist & CSUM_INNER_IP)
				hwa |= CSUM_IP;
			if (p->if_hwassist & CSUM_INNER_IP_UDP)
				hwa |= CSUM_IP_UDP;
			if (p->if_hwassist & CSUM_INNER_IP_TCP)
				hwa |= CSUM_IP_TCP;
		}
		if (ena & IFCAP_TXCSUM_IPV6) {
			if (p->if_hwassist & CSUM_INNER_IP6_UDP)
				hwa |= CSUM_IP6_UDP;
			if (p->if_hwassist &
CSUM_INNER_IP6_TCP)
				hwa |= CSUM_IP6_TCP;
		}
	}

	/* hardware TSO */
	if (p->if_capabilities & IFCAP_VXLAN_HWTSO) {
		cap |= p->if_capabilities & IFCAP_TSO;
		/* Clamp so that the outer headers still fit in IP_MAXPACKET. */
		if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen)
			ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen;
		else
			ifp->if_hw_tsomax = p->if_hw_tsomax;
		/* XXX: tsomaxsegcount decrement is cxgbe specific */
		ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1;
		ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize;
	}
	if (p->if_capenable & IFCAP_VXLAN_HWTSO) {
		ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO;
		if (ena & IFCAP_TSO) {
			if (p->if_hwassist & CSUM_INNER_IP_TSO)
				hwa |= CSUM_IP_TSO;
			if (p->if_hwassist & CSUM_INNER_IP6_TSO)
				hwa |= CSUM_IP6_TSO;
		}
	}

	ifp->if_capabilities |= cap;
	ifp->if_capenable |= ena;
	ifp->if_hwassist |= hwa;
	/* Only the ifunit_ref() lookup took a reference on p. */
	if (rel)
		if_rele(p);
done:
	CURVNET_RESTORE();
	NET_EPOCH_EXIT(et);
}

/*
 * Cloner create callback: allocate and attach a new vxlan interface.
 *
 * Lines prefixed with '-' and '+' below are patch markers carried over from
 * the diff this text was taken from: '-' lines are the old code taking
 * (unit, params), '+' lines are the replacement taking a struct ifc_data
 * and returning the new ifnet through ifpp.
 */
static int
-vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
+vxlan_clone_create(struct if_clone *ifc, char *name, size_t len,
+    struct ifc_data *ifd, struct ifnet **ifpp)
{
	struct vxlan_softc *sc;
	struct ifnet *ifp;
	struct ifvxlanparam vxlp;
	int error;

	sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO);
-	sc->vxl_unit = unit;
+	sc->vxl_unit = ifd->unit;
	sc->vxl_fibnum = curthread->td_proc->p_fibnum;
	vxlan_set_default_config(sc);
	error = vxlan_stats_alloc(sc);
	if (error != 0)
		goto fail;

	/* Optional creation parameters override the defaults set above. */
-	if (params != 0) {
-		error = copyin(params, &vxlp, sizeof(vxlp));
+	if (ifd->params != NULL) {
+		error = ifc_copyin(ifd, &vxlp, sizeof(vxlp));
		if (error)
			goto fail;

		error = vxlan_set_user_config(sc, &vxlp);
		if (error)
			goto fail;
	}

	ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		error = ENOSPC;
		goto fail;
	}

	sc->vxl_ifp = ifp;
	rm_init(&sc->vxl_lock, "vxlanrm");
	callout_init_rw(&sc->vxl_callout, &sc->vxl_lock, 0);
	sc->vxl_port_hash_key = arc4random();
	vxlan_ftable_init(sc);

	vxlan_sysctl_setup(sc);

	ifp->if_softc = sc;
-	if_initname(ifp, vxlan_name, unit);
+	if_initname(ifp, vxlan_name, ifd->unit);
	ifp->if_flags =
IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = vxlan_init;
	ifp->if_ioctl = vxlan_ioctl;
	ifp->if_transmit = vxlan_transmit;
	ifp->if_qflush = vxlan_qflush;
	ifp->if_capabilities = VXLAN_BASIC_IFCAPS;
	ifp->if_capenable = VXLAN_BASIC_IFCAPS;
	/* All capability bits start out requested. */
	sc->vxl_reqcap = -1;
	vxlan_set_hwcaps(sc);

	ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status);
	ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->vxl_media, IFM_ETHER | IFM_AUTO);

	ether_gen_addr(ifp, &sc->vxl_hwaddr);
	ether_ifattach(ifp, sc->vxl_hwaddr.octet);

	ifp->if_baudrate = 0;

	VXLAN_WLOCK(sc);
	vxlan_setup_interface_hdrlen(sc);
	VXLAN_WUNLOCK(sc);
+	*ifpp = ifp;

	return (0);

	/*
	 * NOTE(review): only sc itself is freed here.  The jumps to this
	 * label that happen after vxlan_stats_alloc() has succeeded (copyin,
	 * user config and if_alloc failures) do not call vxlan_stats_free()
	 * -- confirm whether the counters leak on those paths.
	 */
fail:
	free(sc, M_VXLAN);
	return (error);
}

/*
 * Cloner destroy callback: tear the interface down, flush the forwarding
 * table, detach and free the ifnet, then release the remaining softc
 * resources and the softc itself.  As in the create callback, '-' and '+'
 * prefixed lines are patch markers; the '+' version returns 0.
 */
-static void
-vxlan_clone_destroy(struct ifnet *ifp)
+static int
+vxlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
{
	struct vxlan_softc *sc;

	sc = ifp->if_softc;

	vxlan_teardown(sc);

	vxlan_ftable_flush(sc, 1);

	ether_ifdetach(ifp);
	if_free(ifp);
	ifmedia_removeall(&sc->vxl_media);

	vxlan_ftable_fini(sc);

	vxlan_sysctl_destroy(sc);
	rm_destroy(&sc->vxl_lock);
	vxlan_stats_free(sc);
	free(sc, M_VXLAN);
+
+	return (0);
}

/* BMV: Taken from if_bridge. */
/*
 * Hash a 6-byte MAC address to a 32-bit value, seeded with
 * sc->vxl_ftable_hash_key.  Bytes 0-3 are folded into a, bytes 4-5 into b,
 * and the three words are then run through one round of the mix macro.
 *
 * NOTE(review): addr[3] is promoted to int before "<< 24", so a byte value
 * of 0x80 or above shifts into the sign bit -- confirm this is acceptable
 * for the compilers in use.
 */
static uint32_t
vxlan_mac_hash(struct vxlan_softc *sc, const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->vxl_ftable_hash_key;

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
*/
#define	mix(a, b, c)							\
do {									\
	a -= b; a -= c; a ^= (c >> 13);					\
	b -= c; b -= a; b ^= (a << 8);					\
	c -= a; c -= b; c ^= (b >> 13);					\
	a -= b; a -= c; a ^= (c >> 12);					\
	b -= c; b -= a; b ^= (a << 16);					\
	c -= a; c -= b; c ^= (b >> 5);					\
	a -= b; a -= c; a ^= (c >> 3);					\
	b -= c; b -= a; b ^= (a << 10);					\
	c -= a; c -= b; c ^= (b >> 15);					\
} while (0)

	mix(a, b, c);

	/* The macro is local to this function. */
#undef mix

	return (c);
}

/* ifmedia change callback: media changes are accepted and ignored. */
static int
vxlan_media_change(struct ifnet *ifp)
{

	/* Ignore. */
	return (0);
}

/*
 * ifmedia status callback: always reports an active, valid, full-duplex
 * Ethernet link.
 */
static void
vxlan_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
{

	ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
}

/*
 * Byte-compare vxladdr against sa over vxladdr's own sa_len bytes.  Returns
 * 0 when they are equal and non-zero otherwise (bcmp semantics).
 */
static int
vxlan_sockaddr_cmp(const union vxlan_sockaddr *vxladdr,
    const struct sockaddr *sa)
{

	return (bcmp(&vxladdr->sa, sa, vxladdr->sa.sa_len));
}

/*
 * Copy a complete IPv4 or IPv6 sockaddr (family, port, address and the
 * remaining fields) into vxladdr.  The destination is zeroed first and its
 * length field is forced to the size of the matching sockaddr structure.
 */
static void
vxlan_sockaddr_copy(union vxlan_sockaddr *vxladdr,
    const struct sockaddr *sa)
{

	MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6);
	bzero(vxladdr, sizeof(*vxladdr));

	if (sa->sa_family == AF_INET) {
		vxladdr->in4 = *satoconstsin(sa);
		vxladdr->in4.sin_len = sizeof(struct sockaddr_in);
	} else if (sa->sa_family == AF_INET6) {
		vxladdr->in6 = *satoconstsin6(sa);
		vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6);
	}
}

/*
 * Compare only the IP address part of sa with vxladdr; ports are not looked
 * at.  Which member of vxladdr is compared is chosen by sa's family.
 * Returns non-zero when equal, 0 otherwise (including for a family other
 * than AF_INET/AF_INET6).
 */
static int
vxlan_sockaddr_in_equal(const union vxlan_sockaddr *vxladdr,
    const struct sockaddr *sa)
{
	int equal;

	if (sa->sa_family == AF_INET) {
		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
		equal = in4->s_addr == vxladdr->in4.sin_addr.s_addr;
	} else if (sa->sa_family == AF_INET6) {
		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
		equal = IN6_ARE_ADDR_EQUAL(in6, &vxladdr->in6.sin6_addr);
	} else
		equal = 0;

	return (equal);
}

/*
 * Copy only the family, length and IP address from sa into vxladdr.  The
 * destination is not zeroed and its port field is not written.
 */
static void
vxlan_sockaddr_in_copy(union vxlan_sockaddr *vxladdr,
    const struct sockaddr *sa)
{

	MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6);

	if (sa->sa_family == AF_INET) {
		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
		vxladdr->in4.sin_family = AF_INET;
		vxladdr->in4.sin_len = sizeof(struct sockaddr_in);
vxladdr->in4.sin_addr = *in4; } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; vxladdr->in6.sin6_family = AF_INET6; vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6); vxladdr->in6.sin6_addr = *in6; } } static int vxlan_sockaddr_supported(const union vxlan_sockaddr *vxladdr, int unspec) { const struct sockaddr *sa; int supported; sa = &vxladdr->sa; supported = 0; if (sa->sa_family == AF_UNSPEC && unspec != 0) { supported = 1; } else if (sa->sa_family == AF_INET) { #ifdef INET supported = 1; #endif } else if (sa->sa_family == AF_INET6) { #ifdef INET6 supported = 1; #endif } return (supported); } static int vxlan_sockaddr_in_any(const union vxlan_sockaddr *vxladdr) { const struct sockaddr *sa; int any; sa = &vxladdr->sa; if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; any = in4->s_addr == INADDR_ANY; } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; any = IN6_IS_ADDR_UNSPECIFIED(in6); } else any = -1; return (any); } static int vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *vxladdr) { const struct sockaddr *sa; int mc; sa = &vxladdr->sa; if (sa->sa_family == AF_INET) { const struct in_addr *in4 = &satoconstsin(sa)->sin_addr; mc = IN_MULTICAST(ntohl(in4->s_addr)); } else if (sa->sa_family == AF_INET6) { const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr; mc = IN6_IS_ADDR_MULTICAST(in6); } else mc = -1; return (mc); } static int vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *vxladdr) { int error; MPASS(VXLAN_SOCKADDR_IS_IPV6(vxladdr)); #ifdef INET6 error = sa6_embedscope(&vxladdr->in6, V_ip6_use_defzone); #else error = EAFNOSUPPORT; #endif return (error); } static int vxlan_can_change_config(struct vxlan_softc *sc) { struct ifnet *ifp; ifp = sc->vxl_ifp; VXLAN_LOCK_ASSERT(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return (0); if (sc->vxl_flags & (VXLAN_FLAG_INIT | VXLAN_FLAG_TEARDOWN)) return (0); return 
(1); } static int vxlan_check_vni(uint32_t vni) { return (vni >= VXLAN_VNI_MAX); } static int vxlan_check_ttl(int ttl) { return (ttl > MAXTTL); } static int vxlan_check_ftable_timeout(uint32_t timeout) { return (timeout > VXLAN_FTABLE_MAX_TIMEOUT); } static int vxlan_check_ftable_max(uint32_t max) { return (max > VXLAN_FTABLE_MAX); } static void vxlan_sysctl_setup(struct vxlan_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *node; struct vxlan_statistics *stats; char namebuf[8]; ctx = &sc->vxl_sysctl_ctx; stats = &sc->vxl_stats; snprintf(namebuf, sizeof(namebuf), "%d", sc->vxl_unit); sysctl_ctx_init(ctx); sc->vxl_sysctl_node = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_net_link_vxlan), OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node), OID_AUTO, "ftable", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "count", CTLFLAG_RD, &sc->vxl_ftable_cnt, 0, "Number of entries in forwarding table"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "max", CTLFLAG_RD, &sc->vxl_ftable_max, 0, "Maximum number of entries allowed in forwarding table"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "timeout", CTLFLAG_RD, &sc->vxl_ftable_timeout, 0, "Number of seconds between prunes of the forwarding table"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "dump", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, sc, 0, vxlan_ftable_sysctl_dump, "A", "Dump the forwarding table entries"); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node), OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "ftable_nospace", CTLFLAG_RD, &stats->ftable_nospace, 0, "Fowarding table reached maximum entries"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "ftable_lock_upgrade_failed", CTLFLAG_RD, &stats->ftable_lock_upgrade_failed, 0, "Forwarding table update 
required lock upgrade"); SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "txcsum", CTLFLAG_RD, &stats->txcsum, "# of times hardware assisted with tx checksum"); SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tso", CTLFLAG_RD, &stats->tso, "# of times hardware assisted with TSO"); SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "rxcsum", CTLFLAG_RD, &stats->rxcsum, "# of times hardware assisted with rx checksum"); } static void vxlan_sysctl_destroy(struct vxlan_softc *sc) { sysctl_ctx_free(&sc->vxl_sysctl_ctx); sc->vxl_sysctl_node = NULL; } static int vxlan_tunable_int(struct vxlan_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "net.link.vxlan.%d.%s", sc->vxl_unit, knob); TUNABLE_INT_FETCH(path, &def); return (def); } static void vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp) { struct vxlan_softc_head list; struct vxlan_socket *vso; struct vxlan_softc *sc, *tsc; LIST_INIT(&list); if (ifp->if_flags & IFF_RENAMING) return; if ((ifp->if_flags & IFF_MULTICAST) == 0) return; VXLAN_LIST_LOCK(); LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) vxlan_socket_ifdetach(vso, ifp, &list); VXLAN_LIST_UNLOCK(); LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) { LIST_REMOVE(sc, vxl_ifdetach_list); sx_xlock(&vxlan_sx); VXLAN_WLOCK(sc); if (sc->vxl_flags & VXLAN_FLAG_INIT) vxlan_init_wait(sc); vxlan_teardown_locked(sc); sx_xunlock(&vxlan_sx); } } static void vxlan_load(void) { mtx_init(&vxlan_list_mtx, "vxlan list", NULL, MTX_DEF); LIST_INIT(&vxlan_socket_list); vxlan_ifdetach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vxlan_ifdetach_event, NULL, EVENTHANDLER_PRI_ANY); - vxlan_cloner = if_clone_simple(vxlan_name, vxlan_clone_create, - vxlan_clone_destroy, 0); + + struct if_clone_addreq req = { + .create_f = vxlan_clone_create, + .destroy_f = vxlan_clone_destroy, + .flags = IFC_F_AUTOUNIT, + }; + vxlan_cloner = ifc_attach_cloner(vxlan_name, &req); } static void 
vxlan_unload(void) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, vxlan_ifdetach_event_tag); - if_clone_detach(vxlan_cloner); + ifc_detach_cloner(vxlan_cloner); mtx_destroy(&vxlan_list_mtx); MPASS(LIST_EMPTY(&vxlan_socket_list)); } static int vxlan_modevent(module_t mod, int type, void *unused) { int error; error = 0; switch (type) { case MOD_LOAD: vxlan_load(); break; case MOD_UNLOAD: vxlan_unload(); break; default: error = ENOTSUP; break; } return (error); } static moduledata_t vxlan_mod = { "if_vxlan", vxlan_modevent, 0 }; DECLARE_MODULE(if_vxlan, vxlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vxlan, 1); diff --git a/sys/net80211/ieee80211_freebsd.c b/sys/net80211/ieee80211_freebsd.c index 5511791ae2aa..ec970e217dfd 100644 --- a/sys/net80211/ieee80211_freebsd.c +++ b/sys/net80211/ieee80211_freebsd.c @@ -1,1185 +1,1194 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * IEEE 802.11 support (FreeBSD-specific code) */ #include "opt_wlan.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include DEBUGNET_DEFINE(ieee80211); SYSCTL_NODE(_net, OID_AUTO, wlan, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "IEEE 80211 parameters"); #ifdef IEEE80211_DEBUG static int ieee80211_debug = 0; SYSCTL_INT(_net_wlan, OID_AUTO, debug, CTLFLAG_RW, &ieee80211_debug, 0, "debugging printfs"); #endif static const char wlanname[] = "wlan"; static struct if_clone *wlan_cloner; /* * priv(9) NET80211 checks. * Return 0 if operation is allowed, E* (usually EPERM) otherwise. 
*/ int ieee80211_priv_check_vap_getkey(u_long cmd __unused, struct ieee80211vap *vap __unused, struct ifnet *ifp __unused) { return (priv_check(curthread, PRIV_NET80211_VAP_GETKEY)); } int ieee80211_priv_check_vap_manage(u_long cmd __unused, struct ieee80211vap *vap __unused, struct ifnet *ifp __unused) { return (priv_check(curthread, PRIV_NET80211_VAP_MANAGE)); } int ieee80211_priv_check_vap_setmac(u_long cmd __unused, struct ieee80211vap *vap __unused, struct ifnet *ifp __unused) { return (priv_check(curthread, PRIV_NET80211_VAP_SETMAC)); } int ieee80211_priv_check_create_vap(u_long cmd __unused, struct ieee80211vap *vap __unused, struct ifnet *ifp __unused) { return (priv_check(curthread, PRIV_NET80211_CREATE_VAP)); } static int -wlan_clone_create(struct if_clone *ifc, int unit, caddr_t params) +wlan_clone_create(struct if_clone *ifc, char *name, size_t len, + struct ifc_data *ifd, struct ifnet **ifpp) { struct ieee80211_clone_params cp; struct ieee80211vap *vap; struct ieee80211com *ic; int error; error = ieee80211_priv_check_create_vap(0, NULL, NULL); if (error) return error; - error = copyin(params, &cp, sizeof(cp)); + error = ifc_copyin(ifd, &cp, sizeof(cp)); if (error) return error; ic = ieee80211_find_com(cp.icp_parent); if (ic == NULL) return ENXIO; if (cp.icp_opmode >= IEEE80211_OPMODE_MAX) { ic_printf(ic, "%s: invalid opmode %d\n", __func__, cp.icp_opmode); return EINVAL; } if ((ic->ic_caps & ieee80211_opcap[cp.icp_opmode]) == 0) { ic_printf(ic, "%s mode not supported\n", ieee80211_opmode_name[cp.icp_opmode]); return EOPNOTSUPP; } if ((cp.icp_flags & IEEE80211_CLONE_TDMA) && #ifdef IEEE80211_SUPPORT_TDMA (ic->ic_caps & IEEE80211_C_TDMA) == 0 #else (1) #endif ) { ic_printf(ic, "TDMA not supported\n"); return EOPNOTSUPP; } - vap = ic->ic_vap_create(ic, wlanname, unit, + vap = ic->ic_vap_create(ic, wlanname, ifd->unit, cp.icp_opmode, cp.icp_flags, cp.icp_bssid, cp.icp_flags & IEEE80211_CLONE_MACADDR ? 
cp.icp_macaddr : ic->ic_macaddr); if (vap == NULL) return (EIO); #ifdef DEBUGNET if (ic->ic_debugnet_meth != NULL) DEBUGNET_SET(vap->iv_ifp, ieee80211); #endif + *ifpp = vap->iv_ifp; + return (0); } -static void -wlan_clone_destroy(struct ifnet *ifp) +static int +wlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct ieee80211vap *vap = ifp->if_softc; struct ieee80211com *ic = vap->iv_ic; ic->ic_vap_delete(vap); + + return (0); } void ieee80211_vap_destroy(struct ieee80211vap *vap) { CURVNET_SET(vap->iv_ifp->if_vnet); if_clone_destroyif(wlan_cloner, vap->iv_ifp); CURVNET_RESTORE(); } int ieee80211_sysctl_msecs_ticks(SYSCTL_HANDLER_ARGS) { int msecs = ticks_to_msecs(*(int *)arg1); int error; error = sysctl_handle_int(oidp, &msecs, 0, req); if (error || !req->newptr) return error; *(int *)arg1 = msecs_to_ticks(msecs); return 0; } static int ieee80211_sysctl_inact(SYSCTL_HANDLER_ARGS) { int inact = (*(int *)arg1) * IEEE80211_INACT_WAIT; int error; error = sysctl_handle_int(oidp, &inact, 0, req); if (error || !req->newptr) return error; *(int *)arg1 = inact / IEEE80211_INACT_WAIT; return 0; } static int ieee80211_sysctl_parent(SYSCTL_HANDLER_ARGS) { struct ieee80211com *ic = arg1; return SYSCTL_OUT_STR(req, ic->ic_name); } static int ieee80211_sysctl_radar(SYSCTL_HANDLER_ARGS) { struct ieee80211com *ic = arg1; int t = 0, error; error = sysctl_handle_int(oidp, &t, 0, req); if (error || !req->newptr) return error; IEEE80211_LOCK(ic); ieee80211_dfs_notify_radar(ic, ic->ic_curchan); IEEE80211_UNLOCK(ic); return 0; } /* * For now, just restart everything. * * Later on, it'd be nice to have a separate VAP restart to * full-device restart. 
*/ static int ieee80211_sysctl_vap_restart(SYSCTL_HANDLER_ARGS) { struct ieee80211vap *vap = arg1; int t = 0, error; error = sysctl_handle_int(oidp, &t, 0, req); if (error || !req->newptr) return error; ieee80211_restart_all(vap->iv_ic); return 0; } void ieee80211_sysctl_attach(struct ieee80211com *ic) { } void ieee80211_sysctl_detach(struct ieee80211com *ic) { } void ieee80211_sysctl_vattach(struct ieee80211vap *vap) { struct ifnet *ifp = vap->iv_ifp; struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; char num[14]; /* sufficient for 32 bits */ ctx = (struct sysctl_ctx_list *) IEEE80211_MALLOC(sizeof(struct sysctl_ctx_list), M_DEVBUF, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); if (ctx == NULL) { if_printf(ifp, "%s: cannot allocate sysctl context!\n", __func__); return; } sysctl_ctx_init(ctx); snprintf(num, sizeof(num), "%u", ifp->if_dunit); oid = SYSCTL_ADD_NODE(ctx, &SYSCTL_NODE_CHILDREN(_net, wlan), OID_AUTO, num, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, vap->iv_ic, 0, ieee80211_sysctl_parent, "A", "parent device"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "driver_caps", CTLFLAG_RW, &vap->iv_caps, 0, "driver capabilities"); #ifdef IEEE80211_DEBUG vap->iv_debug = ieee80211_debug; SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "debug", CTLFLAG_RW, &vap->iv_debug, 0, "control debugging printfs"); #endif SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "bmiss_max", CTLFLAG_RW, &vap->iv_bmiss_max, 0, "consecutive beacon misses before scanning"); /* XXX inherit from tunables */ SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "inact_run", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &vap->iv_inact_run, 0, ieee80211_sysctl_inact, "I", "station inactivity timeout (sec)"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "inact_probe", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &vap->iv_inact_probe, 0, ieee80211_sysctl_inact, "I", 
"station inactivity probe timeout (sec)"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "inact_auth", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &vap->iv_inact_auth, 0, ieee80211_sysctl_inact, "I", "station authentication timeout (sec)"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "inact_init", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &vap->iv_inact_init, 0, ieee80211_sysctl_inact, "I", "station initial state timeout (sec)"); if (vap->iv_htcaps & IEEE80211_HTC_HT) { SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "ampdu_mintraffic_bk", CTLFLAG_RW, &vap->iv_ampdu_mintraffic[WME_AC_BK], 0, "BK traffic tx aggr threshold (pps)"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "ampdu_mintraffic_be", CTLFLAG_RW, &vap->iv_ampdu_mintraffic[WME_AC_BE], 0, "BE traffic tx aggr threshold (pps)"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "ampdu_mintraffic_vo", CTLFLAG_RW, &vap->iv_ampdu_mintraffic[WME_AC_VO], 0, "VO traffic tx aggr threshold (pps)"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "ampdu_mintraffic_vi", CTLFLAG_RW, &vap->iv_ampdu_mintraffic[WME_AC_VI], 0, "VI traffic tx aggr threshold (pps)"); } SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "force_restart", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, vap, 0, ieee80211_sysctl_vap_restart, "I", "force a VAP restart"); if (vap->iv_caps & IEEE80211_C_DFS) { SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "radar", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, vap->iv_ic, 0, ieee80211_sysctl_radar, "I", "simulate radar event"); } vap->iv_sysctl = ctx; vap->iv_oid = oid; } void ieee80211_sysctl_vdetach(struct ieee80211vap *vap) { if (vap->iv_sysctl != NULL) { sysctl_ctx_free(vap->iv_sysctl); IEEE80211_FREE(vap->iv_sysctl, M_DEVBUF); vap->iv_sysctl = NULL; } } int ieee80211_com_vincref(struct ieee80211vap *vap) { uint32_t ostate; ostate = atomic_fetchadd_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD); if (ostate & IEEE80211_COM_DETACHED) { 
atomic_subtract_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD); return (ENETDOWN); } if (_IEEE80211_MASKSHIFT(ostate, IEEE80211_COM_REF) == IEEE80211_COM_REF_MAX) { atomic_subtract_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD); return (EOVERFLOW); } return (0); } void ieee80211_com_vdecref(struct ieee80211vap *vap) { uint32_t ostate; ostate = atomic_fetchadd_32(&vap->iv_com_state, -IEEE80211_COM_REF_ADD); KASSERT(_IEEE80211_MASKSHIFT(ostate, IEEE80211_COM_REF) != 0, ("com reference counter underflow")); (void) ostate; } void ieee80211_com_vdetach(struct ieee80211vap *vap) { int sleep_time; sleep_time = msecs_to_ticks(250); atomic_set_32(&vap->iv_com_state, IEEE80211_COM_DETACHED); while (_IEEE80211_MASKSHIFT(atomic_load_32(&vap->iv_com_state), IEEE80211_COM_REF) != 0) pause("comref", sleep_time); } int ieee80211_node_dectestref(struct ieee80211_node *ni) { /* XXX need equivalent of atomic_dec_and_test */ atomic_subtract_int(&ni->ni_refcnt, 1); return atomic_cmpset_int(&ni->ni_refcnt, 0, 1); } void ieee80211_drain_ifq(struct ifqueue *ifq) { struct ieee80211_node *ni; struct mbuf *m; for (;;) { IF_DEQUEUE(ifq, m); if (m == NULL) break; ni = (struct ieee80211_node *)m->m_pkthdr.rcvif; KASSERT(ni != NULL, ("frame w/o node")); ieee80211_free_node(ni); m->m_pkthdr.rcvif = NULL; m_freem(m); } } void ieee80211_flush_ifq(struct ifqueue *ifq, struct ieee80211vap *vap) { struct ieee80211_node *ni; struct mbuf *m, **mprev; IF_LOCK(ifq); mprev = &ifq->ifq_head; while ((m = *mprev) != NULL) { ni = (struct ieee80211_node *)m->m_pkthdr.rcvif; if (ni != NULL && ni->ni_vap == vap) { *mprev = m->m_nextpkt; /* remove from list */ ifq->ifq_len--; m_freem(m); ieee80211_free_node(ni); /* reclaim ref */ } else mprev = &m->m_nextpkt; } /* recalculate tail ptr */ m = ifq->ifq_head; for (; m != NULL && m->m_nextpkt != NULL; m = m->m_nextpkt) ; ifq->ifq_tail = m; IF_UNLOCK(ifq); } /* * As above, for mbufs allocated with m_gethdr/MGETHDR * or initialized by M_COPY_PKTHDR. 
*/ #define MC_ALIGN(m, len) \ do { \ (m)->m_data += rounddown2(MCLBYTES - (len), sizeof(long)); \ } while (/* CONSTCOND */ 0) /* * Allocate and setup a management frame of the specified * size. We return the mbuf and a pointer to the start * of the contiguous data area that's been reserved based * on the packet length. The data area is forced to 32-bit * alignment and the buffer length to a multiple of 4 bytes. * This is done mainly so beacon frames (that require this) * can use this interface too. */ struct mbuf * ieee80211_getmgtframe(uint8_t **frm, int headroom, int pktlen) { struct mbuf *m; u_int len; /* * NB: we know the mbuf routines will align the data area * so we don't need to do anything special. */ len = roundup2(headroom + pktlen, 4); KASSERT(len <= MCLBYTES, ("802.11 mgt frame too large: %u", len)); if (len < MINCLSIZE) { m = m_gethdr(IEEE80211_M_NOWAIT, MT_DATA); /* * Align the data in case additional headers are added. * This should only happen when a WEP header is added * which only happens for shared key authentication mgt * frames which all fit in MHLEN. */ if (m != NULL) M_ALIGN(m, len); } else { m = m_getcl(IEEE80211_M_NOWAIT, MT_DATA, M_PKTHDR); if (m != NULL) MC_ALIGN(m, len); } if (m != NULL) { m->m_data += headroom; *frm = m->m_data; } return m; } #ifndef __NO_STRICT_ALIGNMENT /* * Re-align the payload in the mbuf. This is mainly used (right now) * to handle IP header alignment requirements on certain architectures. */ struct mbuf * ieee80211_realign(struct ieee80211vap *vap, struct mbuf *m, size_t align) { int pktlen, space; struct mbuf *n; pktlen = m->m_pkthdr.len; space = pktlen + align; if (space < MINCLSIZE) n = m_gethdr(IEEE80211_M_NOWAIT, MT_DATA); else { n = m_getjcl(IEEE80211_M_NOWAIT, MT_DATA, M_PKTHDR, space <= MCLBYTES ? MCLBYTES : #if MJUMPAGESIZE != MCLBYTES space <= MJUMPAGESIZE ? MJUMPAGESIZE : #endif space <= MJUM9BYTES ? 
MJUM9BYTES : MJUM16BYTES); } if (__predict_true(n != NULL)) { m_move_pkthdr(n, m); n->m_data = (caddr_t)(ALIGN(n->m_data + align) - align); m_copydata(m, 0, pktlen, mtod(n, caddr_t)); n->m_len = pktlen; } else { IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY, mtod(m, const struct ieee80211_frame *), NULL, "%s", "no mbuf to realign"); vap->iv_stats.is_rx_badalign++; } m_freem(m); return n; } #endif /* !__NO_STRICT_ALIGNMENT */ int ieee80211_add_callback(struct mbuf *m, void (*func)(struct ieee80211_node *, void *, int), void *arg) { struct m_tag *mtag; struct ieee80211_cb *cb; mtag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_CALLBACK, sizeof(struct ieee80211_cb), IEEE80211_M_NOWAIT); if (mtag == NULL) return 0; cb = (struct ieee80211_cb *)(mtag+1); cb->func = func; cb->arg = arg; m_tag_prepend(m, mtag); m->m_flags |= M_TXCB; return 1; } int ieee80211_add_xmit_params(struct mbuf *m, const struct ieee80211_bpf_params *params) { struct m_tag *mtag; struct ieee80211_tx_params *tx; mtag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_XMIT_PARAMS, sizeof(struct ieee80211_tx_params), IEEE80211_M_NOWAIT); if (mtag == NULL) return (0); tx = (struct ieee80211_tx_params *)(mtag+1); memcpy(&tx->params, params, sizeof(struct ieee80211_bpf_params)); m_tag_prepend(m, mtag); return (1); } int ieee80211_get_xmit_params(struct mbuf *m, struct ieee80211_bpf_params *params) { struct m_tag *mtag; struct ieee80211_tx_params *tx; mtag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_XMIT_PARAMS, NULL); if (mtag == NULL) return (-1); tx = (struct ieee80211_tx_params *)(mtag + 1); memcpy(params, &tx->params, sizeof(struct ieee80211_bpf_params)); return (0); } void ieee80211_process_callback(struct ieee80211_node *ni, struct mbuf *m, int status) { struct m_tag *mtag; mtag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_CALLBACK, NULL); if (mtag != NULL) { struct ieee80211_cb *cb = (struct ieee80211_cb *)(mtag+1); cb->func(ni, cb->arg, status); } } /* * Add RX parameters to the given mbuf. 
* * Returns 1 if OK, 0 on error. */ int ieee80211_add_rx_params(struct mbuf *m, const struct ieee80211_rx_stats *rxs) { struct m_tag *mtag; struct ieee80211_rx_params *rx; mtag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_RECV_PARAMS, sizeof(struct ieee80211_rx_stats), IEEE80211_M_NOWAIT); if (mtag == NULL) return (0); rx = (struct ieee80211_rx_params *)(mtag + 1); memcpy(&rx->params, rxs, sizeof(*rxs)); m_tag_prepend(m, mtag); return (1); } int ieee80211_get_rx_params(struct mbuf *m, struct ieee80211_rx_stats *rxs) { struct m_tag *mtag; struct ieee80211_rx_params *rx; mtag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_RECV_PARAMS, NULL); if (mtag == NULL) return (-1); rx = (struct ieee80211_rx_params *)(mtag + 1); memcpy(rxs, &rx->params, sizeof(*rxs)); return (0); } const struct ieee80211_rx_stats * ieee80211_get_rx_params_ptr(struct mbuf *m) { struct m_tag *mtag; struct ieee80211_rx_params *rx; mtag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_RECV_PARAMS, NULL); if (mtag == NULL) return (NULL); rx = (struct ieee80211_rx_params *)(mtag + 1); return (&rx->params); } /* * Add TOA parameters to the given mbuf. */ int ieee80211_add_toa_params(struct mbuf *m, const struct ieee80211_toa_params *p) { struct m_tag *mtag; struct ieee80211_toa_params *rp; mtag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_TOA_PARAMS, sizeof(struct ieee80211_toa_params), IEEE80211_M_NOWAIT); if (mtag == NULL) return (0); rp = (struct ieee80211_toa_params *)(mtag + 1); memcpy(rp, p, sizeof(*rp)); m_tag_prepend(m, mtag); return (1); } int ieee80211_get_toa_params(struct mbuf *m, struct ieee80211_toa_params *p) { struct m_tag *mtag; struct ieee80211_toa_params *rp; mtag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_TOA_PARAMS, NULL); if (mtag == NULL) return (0); rp = (struct ieee80211_toa_params *)(mtag + 1); if (p != NULL) memcpy(p, rp, sizeof(*p)); return (1); } /* * Transmit a frame to the parent interface. 
*/ int ieee80211_parent_xmitpkt(struct ieee80211com *ic, struct mbuf *m) { int error; /* * Assert the IC TX lock is held - this enforces the * processing -> queuing order is maintained */ IEEE80211_TX_LOCK_ASSERT(ic); error = ic->ic_transmit(ic, m); if (error) { struct ieee80211_node *ni; ni = (struct ieee80211_node *)m->m_pkthdr.rcvif; /* XXX number of fragments */ if_inc_counter(ni->ni_vap->iv_ifp, IFCOUNTER_OERRORS, 1); ieee80211_free_node(ni); ieee80211_free_mbuf(m); } return (error); } /* * Transmit a frame to the VAP interface. */ int ieee80211_vap_xmitpkt(struct ieee80211vap *vap, struct mbuf *m) { struct ifnet *ifp = vap->iv_ifp; /* * When transmitting via the VAP, we shouldn't hold * any IC TX lock as the VAP TX path will acquire it. */ IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic); return (ifp->if_transmit(ifp, m)); } #include void net80211_get_random_bytes(void *p, size_t n) { uint8_t *dp = p; while (n > 0) { uint32_t v = arc4random(); size_t nb = n > sizeof(uint32_t) ? sizeof(uint32_t) : n; bcopy(&v, dp, n > sizeof(uint32_t) ? sizeof(uint32_t) : n); dp += sizeof(uint32_t), n -= nb; } } /* * Helper function for events that pass just a single mac address. */ static void notify_macaddr(struct ifnet *ifp, int op, const uint8_t mac[IEEE80211_ADDR_LEN]) { struct ieee80211_join_event iev; CURVNET_SET(ifp->if_vnet); memset(&iev, 0, sizeof(iev)); IEEE80211_ADDR_COPY(iev.iev_addr, mac); rt_ieee80211msg(ifp, op, &iev, sizeof(iev)); CURVNET_RESTORE(); } void ieee80211_notify_node_join(struct ieee80211_node *ni, int newassoc) { struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; CURVNET_SET_QUIET(ifp->if_vnet); IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode join", (ni == vap->iv_bss) ? "bss " : ""); if (ni == vap->iv_bss) { notify_macaddr(ifp, newassoc ? RTM_IEEE80211_ASSOC : RTM_IEEE80211_REASSOC, ni->ni_bssid); if_link_state_change(ifp, LINK_STATE_UP); } else { notify_macaddr(ifp, newassoc ? 
RTM_IEEE80211_JOIN : RTM_IEEE80211_REJOIN, ni->ni_macaddr); } CURVNET_RESTORE(); } void ieee80211_notify_node_leave(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; CURVNET_SET_QUIET(ifp->if_vnet); IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode leave", (ni == vap->iv_bss) ? "bss " : ""); if (ni == vap->iv_bss) { rt_ieee80211msg(ifp, RTM_IEEE80211_DISASSOC, NULL, 0); if_link_state_change(ifp, LINK_STATE_DOWN); } else { /* fire off wireless event station leaving */ notify_macaddr(ifp, RTM_IEEE80211_LEAVE, ni->ni_macaddr); } CURVNET_RESTORE(); } void ieee80211_notify_scan_done(struct ieee80211vap *vap) { struct ifnet *ifp = vap->iv_ifp; IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s\n", "notify scan done"); /* dispatch wireless event indicating scan completed */ CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_SCAN, NULL, 0); CURVNET_RESTORE(); } void ieee80211_notify_replay_failure(struct ieee80211vap *vap, const struct ieee80211_frame *wh, const struct ieee80211_key *k, u_int64_t rsc, int tid) { struct ifnet *ifp = vap->iv_ifp; IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO, wh->i_addr2, "%s replay detected tid %d ", k->wk_cipher->ic_name, tid, (intmax_t) rsc, (intmax_t) rsc, (intmax_t) k->wk_keyrsc[tid], (intmax_t) k->wk_keyrsc[tid], k->wk_keyix, k->wk_rxkeyix); if (ifp != NULL) { /* NB: for cipher test modules */ struct ieee80211_replay_event iev; IEEE80211_ADDR_COPY(iev.iev_dst, wh->i_addr1); IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2); iev.iev_cipher = k->wk_cipher->ic_cipher; if (k->wk_rxkeyix != IEEE80211_KEYIX_NONE) iev.iev_keyix = k->wk_rxkeyix; else iev.iev_keyix = k->wk_keyix; iev.iev_keyrsc = k->wk_keyrsc[tid]; iev.iev_rsc = rsc; CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_REPLAY, &iev, sizeof(iev)); CURVNET_RESTORE(); } } void ieee80211_notify_michael_failure(struct ieee80211vap *vap, const struct ieee80211_frame *wh, u_int keyix) { struct ifnet *ifp = vap->iv_ifp; 
IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO, wh->i_addr2, "michael MIC verification failed ", keyix); vap->iv_stats.is_rx_tkipmic++; if (ifp != NULL) { /* NB: for cipher test modules */ struct ieee80211_michael_event iev; IEEE80211_ADDR_COPY(iev.iev_dst, wh->i_addr1); IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2); iev.iev_cipher = IEEE80211_CIPHER_TKIP; iev.iev_keyix = keyix; CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_MICHAEL, &iev, sizeof(iev)); CURVNET_RESTORE(); } } void ieee80211_notify_wds_discover(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; notify_macaddr(ifp, RTM_IEEE80211_WDS, ni->ni_macaddr); } void ieee80211_notify_csa(struct ieee80211com *ic, const struct ieee80211_channel *c, int mode, int count) { struct ieee80211_csa_event iev; struct ieee80211vap *vap; struct ifnet *ifp; memset(&iev, 0, sizeof(iev)); iev.iev_flags = c->ic_flags; iev.iev_freq = c->ic_freq; iev.iev_ieee = c->ic_ieee; iev.iev_mode = mode; iev.iev_count = count; TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) { ifp = vap->iv_ifp; CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_CSA, &iev, sizeof(iev)); CURVNET_RESTORE(); } } void ieee80211_notify_radar(struct ieee80211com *ic, const struct ieee80211_channel *c) { struct ieee80211_radar_event iev; struct ieee80211vap *vap; struct ifnet *ifp; memset(&iev, 0, sizeof(iev)); iev.iev_flags = c->ic_flags; iev.iev_freq = c->ic_freq; iev.iev_ieee = c->ic_ieee; TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) { ifp = vap->iv_ifp; CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_RADAR, &iev, sizeof(iev)); CURVNET_RESTORE(); } } void ieee80211_notify_cac(struct ieee80211com *ic, const struct ieee80211_channel *c, enum ieee80211_notify_cac_event type) { struct ieee80211_cac_event iev; struct ieee80211vap *vap; struct ifnet *ifp; memset(&iev, 0, sizeof(iev)); iev.iev_flags = c->ic_flags; iev.iev_freq = c->ic_freq; iev.iev_ieee = c->ic_ieee; iev.iev_type = type; 
TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) { ifp = vap->iv_ifp; CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_CAC, &iev, sizeof(iev)); CURVNET_RESTORE(); } } void ieee80211_notify_node_deauth(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%s", "node deauth"); notify_macaddr(ifp, RTM_IEEE80211_DEAUTH, ni->ni_macaddr); } void ieee80211_notify_node_auth(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ifnet *ifp = vap->iv_ifp; IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%s", "node auth"); notify_macaddr(ifp, RTM_IEEE80211_AUTH, ni->ni_macaddr); } void ieee80211_notify_country(struct ieee80211vap *vap, const uint8_t bssid[IEEE80211_ADDR_LEN], const uint8_t cc[2]) { struct ifnet *ifp = vap->iv_ifp; struct ieee80211_country_event iev; memset(&iev, 0, sizeof(iev)); IEEE80211_ADDR_COPY(iev.iev_addr, bssid); iev.iev_cc[0] = cc[0]; iev.iev_cc[1] = cc[1]; CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_COUNTRY, &iev, sizeof(iev)); CURVNET_RESTORE(); } void ieee80211_notify_radio(struct ieee80211com *ic, int state) { struct ieee80211_radio_event iev; struct ieee80211vap *vap; struct ifnet *ifp; memset(&iev, 0, sizeof(iev)); iev.iev_state = state; TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) { ifp = vap->iv_ifp; CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_RADIO, &iev, sizeof(iev)); CURVNET_RESTORE(); } } void ieee80211_notify_ifnet_change(struct ieee80211vap *vap) { struct ifnet *ifp = vap->iv_ifp; IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG, "%s\n", "interface state change"); CURVNET_SET(ifp->if_vnet); rt_ifmsg(ifp); CURVNET_RESTORE(); } void ieee80211_load_module(const char *modname) { #ifdef notyet (void)kern_kldload(curthread, modname, NULL); #else printf("%s: load the %s module by hand for now.\n", __func__, modname); #endif } static eventhandler_tag wlan_bpfevent; static eventhandler_tag wlan_ifllevent; 
static void bpf_track(void *arg, struct ifnet *ifp, int dlt, int attach) { /* NB: identify vap's by if_init */ if (dlt == DLT_IEEE802_11_RADIO && ifp->if_init == ieee80211_init) { struct ieee80211vap *vap = ifp->if_softc; /* * Track bpf radiotap listener state. We mark the vap * to indicate if any listener is present and the com * to indicate if any listener exists on any associated * vap. This flag is used by drivers to prepare radiotap * state only when needed. */ if (attach) { ieee80211_syncflag_ext(vap, IEEE80211_FEXT_BPF); if (vap->iv_opmode == IEEE80211_M_MONITOR) atomic_add_int(&vap->iv_ic->ic_montaps, 1); } else if (!bpf_peers_present(vap->iv_rawbpf)) { ieee80211_syncflag_ext(vap, -IEEE80211_FEXT_BPF); if (vap->iv_opmode == IEEE80211_M_MONITOR) atomic_subtract_int(&vap->iv_ic->ic_montaps, 1); } } } /* * Change MAC address on the vap (if was not started). */ static void wlan_iflladdr(void *arg __unused, struct ifnet *ifp) { /* NB: identify vap's by if_init */ if (ifp->if_init == ieee80211_init && (ifp->if_flags & IFF_UP) == 0) { struct ieee80211vap *vap = ifp->if_softc; IEEE80211_ADDR_COPY(vap->iv_myaddr, IF_LLADDR(ifp)); } } /* * Fetch the VAP name. * * This returns a const char pointer suitable for debugging, * but don't expect it to stick around for much longer. 
*/ const char * ieee80211_get_vap_ifname(struct ieee80211vap *vap) { if (vap->iv_ifp == NULL) return "(none)"; return vap->iv_ifp->if_xname; } #ifdef DEBUGNET static void ieee80211_debugnet_init(struct ifnet *ifp, int *nrxr, int *ncl, int *clsize) { struct ieee80211vap *vap; struct ieee80211com *ic; vap = if_getsoftc(ifp); ic = vap->iv_ic; IEEE80211_LOCK(ic); ic->ic_debugnet_meth->dn8_init(ic, nrxr, ncl, clsize); IEEE80211_UNLOCK(ic); } static void ieee80211_debugnet_event(struct ifnet *ifp, enum debugnet_ev ev) { struct ieee80211vap *vap; struct ieee80211com *ic; vap = if_getsoftc(ifp); ic = vap->iv_ic; IEEE80211_LOCK(ic); ic->ic_debugnet_meth->dn8_event(ic, ev); IEEE80211_UNLOCK(ic); } static int ieee80211_debugnet_transmit(struct ifnet *ifp, struct mbuf *m) { return (ieee80211_vap_transmit(ifp, m)); } static int ieee80211_debugnet_poll(struct ifnet *ifp, int count) { struct ieee80211vap *vap; struct ieee80211com *ic; vap = if_getsoftc(ifp); ic = vap->iv_ic; return (ic->ic_debugnet_meth->dn8_poll(ic, count)); } #endif /* * Module glue. * * NB: the module name is "wlan" for compatibility with NetBSD. 
*/
/*
 * Module event handler for the "wlan" module.
 *
 * MOD_LOAD:   optionally prints a banner (bootverbose), registers the
 *             bpf_track and iflladdr_event handlers (saving their tags)
 *             and attaches the interface cloner.
 * MOD_UNLOAD: detaches the cloner, then deregisters both handlers.
 *
 * Returns 0 for the two events above and EINVAL for any other type.
 */
static int
wlan_modevent(module_t mod, int type, void *unused)
{
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("wlan: <802.11 Link Layer>\n");
		wlan_bpfevent = EVENTHANDLER_REGISTER(bpf_track,
		    bpf_track, 0, EVENTHANDLER_PRI_ANY);
		wlan_ifllevent = EVENTHANDLER_REGISTER(iflladdr_event,
		    wlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
		/*
		 * Cloner registration: the '-' lines are the previous
		 * if_clone_simple() call; the '+' lines replace it by
		 * filling a struct if_clone_addreq (create/destroy
		 * callbacks plus IFC_F_AUTOUNIT) and passing it to
		 * ifc_attach_cloner().
		 */
-		wlan_cloner = if_clone_simple(wlanname, wlan_clone_create,
-		    wlan_clone_destroy, 0);
+		struct if_clone_addreq req = {
+			.create_f = wlan_clone_create,
+			.destroy_f = wlan_clone_destroy,
+			.flags = IFC_F_AUTOUNIT,
+		};
+		wlan_cloner = ifc_attach_cloner(wlanname, &req);
		return 0;
	case MOD_UNLOAD:
		/* '-' is the old detach call, '+' its replacement. */
-		if_clone_detach(wlan_cloner);
+		ifc_detach_cloner(wlan_cloner);
		EVENTHANDLER_DEREGISTER(bpf_track, wlan_bpfevent);
		EVENTHANDLER_DEREGISTER(iflladdr_event, wlan_ifllevent);
		return 0;
	}
	return EINVAL;
}

/* Module descriptor: name, event handler, no private data. */
static moduledata_t wlan_mod = {
	wlanname,
	wlan_modevent,
	0
};
DECLARE_MODULE(wlan, wlan_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_VERSION(wlan, 1);
MODULE_DEPEND(wlan, ether, 1, 1, 1);
#ifdef	IEEE80211_ALQ
MODULE_DEPEND(wlan, alq, 1, 1, 1);
#endif	/* IEEE80211_ALQ */
diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c
index 261c9f2a4087..b4f34cd13fba 100644
--- a/sys/netpfil/pf/if_pflog.c
+++ b/sys/netpfil/pf/if_pflog.c
@@ -1,321 +1,336 @@
/*-
 * SPDX-License-Identifier: ISC
 *
 * The authors of this code are John Ioannidis (ji@tla.org),
 * Angelos D. Keromytis (kermit@csd.uch.gr) and
 * Niels Provos (provos@physnet.uni-hamburg.de).
 *
 * This code was written by John Ioannidis for BSD/OS in Athens, Greece,
 * in November 1995.
 *
 * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
 * by Angelos D. Keromytis.
 *
 * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
 * and Niels Provos.
 *
 * Copyright (C) 1995, 1996, 1997, 1998 by John Ioannidis, Angelos D. Keromytis
 * and Niels Provos.
 * Copyright (c) 2001, Angelos D. Keromytis, Niels Provos.
* * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. * * $OpenBSD: if_pflog.c,v 1.26 2007/10/18 21:58:18 mpf Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_bpf.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #endif /* INET6 */ #ifdef INET #include #endif /* INET */ #define PFLOGMTU (32768 + MHLEN + MLEN) #ifdef PFLOGDEBUG #define DPRINTF(x) do { if (pflogdebug) printf x ; } while (0) #else #define DPRINTF(x) #endif static int pflogoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void pflogattach(int); static int pflogioctl(struct ifnet *, u_long, caddr_t); static void pflogstart(struct ifnet *); -static int pflog_clone_create(struct if_clone *, int, caddr_t); -static void pflog_clone_destroy(struct ifnet *); +static int pflog_clone_create(struct if_clone *, char *, size_t, + struct ifc_data *, struct ifnet **); +static int pflog_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static const char pflogname[] = "pflog"; VNET_DEFINE_STATIC(struct if_clone *, 
pflog_cloner);
#define	V_pflog_cloner		VNET(pflog_cloner)

VNET_DEFINE(struct ifnet *, pflogifs[PFLOGIFS_MAX]);	/* for fast access */
#define	V_pflogifs		VNET(pflogifs)

/*
 * Per-vnet attach: clear the pflogifs lookup table and register the
 * "pflog" interface cloner.
 *
 * The '-' lines are the previous if_clone_simple() registration (last
 * argument 1).  The '+' lines replace it with ifc_attach_cloner() using
 * IFC_F_AUTOUNIT and then explicitly request unit 0 through
 * ifc_create_ifp().
 *
 * NOTE(review): the return value of ifc_create_ifp() is discarded, so a
 * failure to create unit 0 goes unnoticed here.
 */
static void
pflogattach(int npflog __unused)
{
	int	i;
	for (i = 0; i < PFLOGIFS_MAX; i++)
		V_pflogifs[i] = NULL;
-	V_pflog_cloner = if_clone_simple(pflogname, pflog_clone_create,
-	    pflog_clone_destroy, 1);
+
+	struct if_clone_addreq req = {
+		.create_f = pflog_clone_create,
+		.destroy_f = pflog_clone_destroy,
+		.flags = IFC_F_AUTOUNIT,
+	};
+	V_pflog_cloner = ifc_attach_cloner(pflogname, &req);
+	struct ifc_data ifd = { .unit = 0 };
+	ifc_create_ifp(pflogname, &ifd, NULL);
}

/*
 * Cloner create callback: allocate, configure and attach one pflog
 * interface and record it in V_pflogifs under its unit number.
 *
 * The '-' lines take the unit as a plain int argument; the '+' lines
 * read it from ifd->unit and additionally hand the new ifnet back to
 * the caller through *ifpp.
 *
 * Returns EINVAL when the unit is >= PFLOGIFS_MAX, ENOSPC when
 * if_alloc() fails, and 0 on success.
 */
static int
-pflog_clone_create(struct if_clone *ifc, int unit, caddr_t param)
+pflog_clone_create(struct if_clone *ifc, char *name, size_t maxlen,
+    struct ifc_data *ifd, struct ifnet **ifpp)
{
	struct ifnet *ifp;

	/* The unit doubles as the index into V_pflogifs below. */
-	if (unit >= PFLOGIFS_MAX)
+	if (ifd->unit >= PFLOGIFS_MAX)
		return (EINVAL);

	ifp = if_alloc(IFT_PFLOG);
	if (ifp == NULL) {
		return (ENOSPC);
	}
-	if_initname(ifp, pflogname, unit);
+	if_initname(ifp, pflogname, ifd->unit);
	ifp->if_mtu = PFLOGMTU;
	ifp->if_ioctl = pflogioctl;
	ifp->if_output = pflogoutput;
	ifp->if_start = pflogstart;
	ifp->if_snd.ifq_maxlen = ifqmaxlen;
	ifp->if_hdrlen = PFLOG_HDRLEN;
	if_attach(ifp);

	bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN);

-	V_pflogifs[unit] = ifp;
+	V_pflogifs[ifd->unit] = ifp;
+	*ifpp = ifp;

	return (0);
}

/*
 * Cloner destroy callback: drop the interface from V_pflogifs, detach
 * it from bpf and the interface layer, and free it.
 *
 * The '-' lines are the old void callback.  The '+' lines make it
 * return int, refuse (EINVAL) to destroy the interface whose if_dunit
 * is 0 unless IFC_F_FORCE is set in flags, and return 0 on success.
 */
-static void
-pflog_clone_destroy(struct ifnet *ifp)
+static int
+pflog_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
{
	int i;

+	if (ifp->if_dunit == 0 && (flags & IFC_F_FORCE) == 0)
+		return (EINVAL);
+
	/* Clear every table slot that still points at this ifnet. */
	for (i = 0; i < PFLOGIFS_MAX; i++)
		if (V_pflogifs[i] == ifp)
			V_pflogifs[i] = NULL;

	bpfdetach(ifp);
	if_detach(ifp);
	if_free(ifp);
+
+	return (0);
}

/*
 * Start output on the pflog interface.
*/
/*
 * Nothing is transmitted: every mbuf queued on ifp->if_snd is dequeued
 * under the queue lock and freed, until the queue is empty.
 */
static void
pflogstart(struct ifnet *ifp)
{
	struct mbuf *m;

	for (;;) {
		IF_LOCK(&ifp->if_snd);
		_IF_DEQUEUE(&ifp->if_snd, m);
		IF_UNLOCK(&ifp->if_snd);

		if (m == NULL)
			return;
		else
			m_freem(m);
	}
}

/*
 * Output routine: frees the mbuf and reports success (0); dst and rt
 * are not used.
 */
static int
pflogoutput(struct ifnet *ifp, struct mbuf *m,
	const struct sockaddr *dst, struct route *rt)
{
	m_freem(m);
	return (0);
}

/*
 * Interface ioctl handler.
 *
 * SIOCSIFFLAGS mirrors IFF_UP from if_flags into IFF_DRV_RUNNING in
 * if_drv_flags and returns 0; every other command returns ENOTTY.
 */
/* ARGSUSED */
static int
pflogioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	switch (cmd) {
	case SIOCSIFFLAGS:
		if (ifp->if_flags & IFF_UP)
			ifp->if_drv_flags |= IFF_DRV_RUNNING;
		else
			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
		break;
	default:
		return (ENOTTY);
	}

	return (0);
}

/*
 * Log packet m to the pflog interface selected by rm->logif.
 *
 * Builds a struct pfloghdr describing the match (address family,
 * action, reason, interface name, rule/subrule numbers, ruleset name,
 * uid/pid information, direction) and taps header plus packet to bpf
 * with BPF_MTAP2(); the interface's output packet and byte counters are
 * incremented.
 *
 * Returns 1 when kif, m, rm or pd is NULL; otherwise 0, including the
 * case where no interface is registered for rm->logif or that interface
 * has no if_bpf set (nothing is logged then).
 *
 * NOTE(review): rm->logif indexes V_pflogifs without a bounds check in
 * this function; presumably the value is validated when the rule is
 * loaded -- confirm.
 */
static int
pflog_packet(struct pfi_kkif *kif, struct mbuf *m, sa_family_t af,
    u_int8_t dir, u_int8_t reason, struct pf_krule *rm, struct pf_krule *am,
    struct pf_kruleset *ruleset, struct pf_pdesc *pd, int lookupsafe)
{
	struct ifnet *ifn;
	struct pfloghdr hdr;

	if (kif == NULL || m == NULL || rm == NULL || pd == NULL)
		return ( 1);

	if ((ifn = V_pflogifs[rm->logif]) == NULL || !ifn->if_bpf)
		return (0);

	bzero(&hdr, sizeof(hdr));
	hdr.length = PFLOG_REAL_HDRLEN;
	hdr.af = af;
	hdr.action = rm->action;
	hdr.reason = reason;
	memcpy(hdr.ifname, kif->pfik_name, sizeof(hdr.ifname));

	/*
	 * Without an anchor rule (am) the matching rule is reported as
	 * the rule number and the subrule is -1; with one, the anchor is
	 * the rule, rm the subrule, and the ruleset's anchor name is
	 * recorded when available.
	 */
	if (am == NULL) {
		hdr.rulenr = htonl(rm->nr);
		hdr.subrulenr = -1;
	} else {
		hdr.rulenr = htonl(am->nr);
		hdr.subrulenr = htonl(rm->nr);
		if (ruleset != NULL && ruleset->anchor != NULL)
			strlcpy(hdr.ruleset, ruleset->anchor->name,
			    sizeof(hdr.ruleset));
	}
	hdr.ridentifier = htonl(rm->ridentifier);
	/*
	 * XXXGL: we avoid pf_socket_lookup() when we are holding
	 * state lock, since this leads to unsafe LOR.
	 * These conditions are very very rare, however.
	 */
	if (rm->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe)
		pd->lookup.done = pf_socket_lookup(dir, pd, m);
	if (pd->lookup.done > 0)
		hdr.uid = pd->lookup.uid;
	else
		hdr.uid = UID_MAX;
	hdr.pid = NO_PID;
	hdr.rule_uid = rm->cuid;
	hdr.rule_pid = rm->cpid;
	hdr.dir = dir;

#ifdef INET
	/*
	 * For outbound IPv4 the IP header checksum is recomputed in place
	 * before the packet is tapped (presumably because it has not been
	 * filled in yet at this point -- confirm).
	 */
	if (af == AF_INET && dir == PF_OUT) {
		struct ip *ip;

		ip = mtod(m, struct ip *);
		ip->ip_sum = 0;
		ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
	}
#endif /* INET */

	if_inc_counter(ifn, IFCOUNTER_OPACKETS, 1);
	if_inc_counter(ifn, IFCOUNTER_OBYTES, m->m_pkthdr.len);
	BPF_MTAP2(ifn, &hdr, PFLOG_HDRLEN, m);

	return (0);
}

/* Per-vnet initialization: run pflogattach(). */
static void
vnet_pflog_init(const void *unused __unused)
{

	pflogattach(1);
}
VNET_SYSINIT(vnet_pflog_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
    vnet_pflog_init, NULL);

/*
 * Per-vnet teardown: detach the pflog cloner.  The '-' line is the old
 * if_clone_detach() call, the '+' line its ifc_detach_cloner()
 * replacement.
 */
static void
vnet_pflog_uninit(const void *unused __unused)
{

-	if_clone_detach(V_pflog_cloner);
+	ifc_detach_cloner(V_pflog_cloner);
}
/*
 * Detach after pf is gone; otherwise we might touch pflog memory
 * from within pf after freeing pflog.
 */
VNET_SYSUNINIT(vnet_pflog_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND,
    vnet_pflog_uninit, NULL);

/*
 * Module event handler.
 *
 * MOD_LOAD installs pflog_packet() as pflog_packet_ptr and MOD_UNLOAD
 * resets the pointer to NULL, both under the pf rules write lock.  Any
 * other event type yields EOPNOTSUPP.
 */
static int
pflog_modevent(module_t mod, int type, void *data)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		PF_RULES_WLOCK();
		pflog_packet_ptr = pflog_packet;
		PF_RULES_WUNLOCK();
		break;
	case MOD_UNLOAD:
		PF_RULES_WLOCK();
		pflog_packet_ptr = NULL;
		PF_RULES_WUNLOCK();
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return error;
}

/* Module descriptor: name, event handler, no private data. */
static moduledata_t pflog_mod = { pflogname, pflog_modevent, 0 };

#define PFLOG_MODVER 1

/* Do not run before pf is initialized as we depend on its locks. */
DECLARE_MODULE(pflog, pflog_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
MODULE_VERSION(pflog, PFLOG_MODVER);
MODULE_DEPEND(pflog, pf, PF_MODVER, PF_MODVER, PF_MODVER);