diff --git a/sys/dev/iicbus/if_ic.c b/sys/dev/iicbus/if_ic.c index 4ca8f3960298..52ab5afb9c4e 100644 --- a/sys/dev/iicbus/if_ic.c +++ b/sys/dev/iicbus/if_ic.c @@ -1,435 +1,435 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1998, 2001 Nicolas Souchu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * I2C bus IP driver */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iicbus_if.h" #define PCF_MASTER_ADDRESS 0xaa #define ICHDRLEN sizeof(u_int32_t) #define ICMTU 1500 /* default mtu */ struct ic_softc { if_t ic_ifp; device_t ic_dev; u_char ic_addr; /* peer I2C address */ int ic_flags; char *ic_obuf; char *ic_ifbuf; char *ic_cp; int ic_xfercnt; int ic_iferrs; struct mtx ic_lock; }; #define IC_SENDING 0x0001 #define IC_OBUF_BUSY 0x0002 #define IC_IFBUF_BUSY 0x0004 #define IC_BUFFERS_BUSY (IC_OBUF_BUSY | IC_IFBUF_BUSY) #define IC_BUFFER_WAITER 0x0004 static int icprobe(device_t); static int icattach(device_t); static int icioctl(if_t, u_long, caddr_t); static int icoutput(if_t, struct mbuf *, const struct sockaddr *, struct route *); static int icintr(device_t, int, char *); static device_method_t ic_methods[] = { /* device interface */ DEVMETHOD(device_probe, icprobe), DEVMETHOD(device_attach, icattach), /* iicbus interface */ DEVMETHOD(iicbus_intr, icintr), { 0, 0 } }; static driver_t ic_driver = { "ic", ic_methods, sizeof(struct ic_softc), }; static void ic_alloc_buffers(struct ic_softc *sc, int mtu) { char *obuf, *ifbuf; obuf = malloc(mtu + ICHDRLEN, M_DEVBUF, M_WAITOK); ifbuf = malloc(mtu + ICHDRLEN, M_DEVBUF, M_WAITOK); mtx_lock(&sc->ic_lock); while (sc->ic_flags & IC_BUFFERS_BUSY) { sc->ic_flags |= IC_BUFFER_WAITER; mtx_sleep(sc, &sc->ic_lock, 0, "icalloc", 0); sc->ic_flags &= ~IC_BUFFER_WAITER; } free(sc->ic_obuf, M_DEVBUF); free(sc->ic_ifbuf, M_DEVBUF); sc->ic_obuf = obuf; sc->ic_ifbuf = ifbuf; if_setmtu(sc->ic_ifp, mtu); mtx_unlock(&sc->ic_lock); } /* * icprobe() */ static int icprobe(device_t dev) { return (BUS_PROBE_NOWILDCARD); } /* * icattach() */ static int icattach(device_t dev) { struct ic_softc *sc = (struct ic_softc *)device_get_softc(dev); if_t 
ifp; ifp = sc->ic_ifp = if_alloc(IFT_PARA); if (ifp == NULL) return (ENOSPC); mtx_init(&sc->ic_lock, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF); sc->ic_addr = PCF_MASTER_ADDRESS; /* XXX only PCF masters */ sc->ic_dev = dev; if_setsoftc(ifp, sc); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setflags(ifp, IFF_SIMPLEX | IFF_POINTOPOINT | IFF_MULTICAST); if_setioctlfn(ifp, icioctl); if_setoutputfn(ifp, icoutput); if_setifheaderlen(ifp, 0); if_setsendqlen(ifp, ifqmaxlen); ic_alloc_buffers(sc, ICMTU); if_attach(ifp); bpfattach(ifp, DLT_NULL, ICHDRLEN); return (0); } /* * iciotcl() */ static int icioctl(if_t ifp, u_long cmd, caddr_t data) { struct ic_softc *sc = if_getsoftc(ifp); device_t icdev = sc->ic_dev; device_t parent = device_get_parent(icdev); struct ifaddr *ifa = (struct ifaddr *)data; struct ifreq *ifr = (struct ifreq *)data; int error; switch (cmd) { case SIOCAIFADDR: case SIOCSIFADDR: if (ifa->ifa_addr->sa_family != AF_INET) return (EAFNOSUPPORT); mtx_lock(&sc->ic_lock); if_setflagbits(ifp, IFF_UP, 0); goto locked; case SIOCSIFFLAGS: mtx_lock(&sc->ic_lock); locked: if ((!(if_getflags(ifp) & IFF_UP)) && (if_getdrvflags(ifp) & IFF_DRV_RUNNING)) { /* XXX disable PCF */ if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); mtx_unlock(&sc->ic_lock); /* IFF_UP is not set, try to release the bus anyway */ iicbus_release_bus(parent, icdev); break; } if (((if_getflags(ifp) & IFF_UP)) && (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))) { mtx_unlock(&sc->ic_lock); if ((error = iicbus_request_bus(parent, icdev, IIC_WAIT | IIC_INTR))) return (error); mtx_lock(&sc->ic_lock); iicbus_reset(parent, IIC_FASTEST, 0, NULL); if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); } mtx_unlock(&sc->ic_lock); break; case SIOCSIFMTU: ic_alloc_buffers(sc, ifr->ifr_mtu); break; case SIOCGIFMTU: mtx_lock(&sc->ic_lock); ifr->ifr_mtu = if_getmtu(sc->ic_ifp); mtx_unlock(&sc->ic_lock); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) return (EAFNOSUPPORT); /* XXX */ switch (ifr->ifr_addr.sa_family) { case AF_INET: break; default: return (EAFNOSUPPORT); } break; default: return (EINVAL); } return (0); } /* * icintr() */ static int icintr(device_t dev, int event, char *ptr) { struct ic_softc *sc = (struct ic_softc *)device_get_softc(dev); struct mbuf *top; int len; mtx_lock(&sc->ic_lock); switch (event) { case INTR_GENERAL: case INTR_START: sc->ic_cp = sc->ic_ifbuf; sc->ic_xfercnt = 0; sc->ic_flags |= IC_IFBUF_BUSY; break; case INTR_STOP: /* if any error occurred during transfert, * drop the packet */ sc->ic_flags &= ~IC_IFBUF_BUSY; if ((sc->ic_flags & (IC_BUFFERS_BUSY | IC_BUFFER_WAITER)) == IC_BUFFER_WAITER) wakeup(&sc); if (sc->ic_iferrs) goto err; if ((len = sc->ic_xfercnt) == 0) break; /* ignore */ if (len <= ICHDRLEN) goto err; len -= ICHDRLEN; if_inc_counter(sc->ic_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(sc->ic_ifp, IFCOUNTER_IBYTES, len); BPF_TAP(sc->ic_ifp, sc->ic_ifbuf, len + ICHDRLEN); top = m_devget(sc->ic_ifbuf + ICHDRLEN, len, 0, sc->ic_ifp, 0); if (top) { struct epoch_tracker et; mtx_unlock(&sc->ic_lock); M_SETFIB(top, if_getfib(sc->ic_ifp)); NET_EPOCH_ENTER(et); netisr_dispatch(NETISR_IP, top); NET_EPOCH_EXIT(et); mtx_lock(&sc->ic_lock); } break; err: if_printf(sc->ic_ifp, "errors (%d)!\n", sc->ic_iferrs); sc->ic_iferrs = 0; /* reset error count */ if_inc_counter(sc->ic_ifp, IFCOUNTER_IERRORS, 1); break; case INTR_RECEIVE: if (sc->ic_xfercnt >= if_getmtu(sc->ic_ifp) + ICHDRLEN) { sc->ic_iferrs++; } else { *sc->ic_cp++ = *ptr; sc->ic_xfercnt++; } break; case INTR_NOACK: /* xfer 
terminated by master */ break; case INTR_TRANSMIT: *ptr = 0xff; /* XXX */ break; case INTR_ERROR: sc->ic_iferrs++; break; default: panic("%s: unknown event (%d)!", __func__, event); } mtx_unlock(&sc->ic_lock); return (0); } /* * icoutput() */ static int icoutput(if_t ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct ic_softc *sc = if_getsoftc(ifp); device_t icdev = sc->ic_dev; device_t parent = device_get_parent(icdev); int len, sent; struct mbuf *mm; u_char *cp; u_int32_t hdr; - /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &hdr, sizeof(hdr)); else hdr = RO_GET_FAMILY(ro, dst); mtx_lock(&sc->ic_lock); if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); /* already sending? */ if (sc->ic_flags & IC_SENDING) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); goto error; } /* insert header */ bcopy ((char *)&hdr, sc->ic_obuf, ICHDRLEN); cp = sc->ic_obuf + ICHDRLEN; len = 0; mm = m; do { if (len + mm->m_len > if_getmtu(sc->ic_ifp)) { /* packet too large */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); goto error; } bcopy(mtod(mm,char *), cp, mm->m_len); cp += mm->m_len; len += mm->m_len; } while ((mm = mm->m_next)); BPF_MTAP2(ifp, &hdr, sizeof(hdr), m); sc->ic_flags |= (IC_SENDING | IC_OBUF_BUSY); m_freem(m); mtx_unlock(&sc->ic_lock); /* send the packet */ if (iicbus_block_write(parent, sc->ic_addr, sc->ic_obuf, len + ICHDRLEN, &sent)) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); } mtx_lock(&sc->ic_lock); sc->ic_flags &= ~(IC_SENDING | IC_OBUF_BUSY); if ((sc->ic_flags & (IC_BUFFERS_BUSY | IC_BUFFER_WAITER)) == IC_BUFFER_WAITER) wakeup(&sc); mtx_unlock(&sc->ic_lock); return (0); error: m_freem(m); mtx_unlock(&sc->ic_lock); return(0); } DRIVER_MODULE(ic, iicbus, ic_driver, 0, 0); MODULE_DEPEND(ic, iicbus, IICBUS_MINVER, IICBUS_PREFVER, IICBUS_MAXVER); MODULE_VERSION(ic, 1); diff --git a/sys/dev/wg/if_wg.c b/sys/dev/wg/if_wg.c index 2c867956912a..30429c3725cd 100644 --- a/sys/dev/wg/if_wg.c +++ b/sys/dev/wg/if_wg.c @@ -1,3056 +1,3057 @@ /* SPDX-License-Identifier: ISC * * Copyright (C) 2015-2021 Jason A. Donenfeld . All Rights Reserved. * Copyright (C) 2019-2021 Matt Dunwoodie * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate) * Copyright (c) 2021 Kyle Evans * Copyright (c) 2022 The FreeBSD Foundation */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "wg_noise.h" #include "wg_cookie.h" #include "version.h" #include "if_wg.h" #define DEFAULT_MTU (ETHERMTU - 80) #define MAX_MTU (IF_MAXMTU - 80) #define MAX_STAGED_PKT 128 #define MAX_QUEUED_PKT 1024 #define MAX_QUEUED_PKT_MASK (MAX_QUEUED_PKT - 1) #define MAX_QUEUED_HANDSHAKES 4096 #define REKEY_TIMEOUT_JITTER 334 /* 1/3 sec, round for arc4random_uniform */ #define MAX_TIMER_HANDSHAKES (90 / REKEY_TIMEOUT) #define NEW_HANDSHAKE_TIMEOUT (REKEY_TIMEOUT + KEEPALIVE_TIMEOUT) #define UNDERLOAD_TIMEOUT 1 #define DPRINTF(sc, ...) 
if (if_getflags(sc->sc_ifp) & IFF_DEBUG) if_printf(sc->sc_ifp, ##__VA_ARGS__) /* First byte indicating packet type on the wire */ #define WG_PKT_INITIATION htole32(1) #define WG_PKT_RESPONSE htole32(2) #define WG_PKT_COOKIE htole32(3) #define WG_PKT_DATA htole32(4) #define WG_PKT_PADDING 16 #define WG_KEY_SIZE 32 struct wg_pkt_initiation { uint32_t t; uint32_t s_idx; uint8_t ue[NOISE_PUBLIC_KEY_LEN]; uint8_t es[NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN]; uint8_t ets[NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN]; struct cookie_macs m; }; struct wg_pkt_response { uint32_t t; uint32_t s_idx; uint32_t r_idx; uint8_t ue[NOISE_PUBLIC_KEY_LEN]; uint8_t en[0 + NOISE_AUTHTAG_LEN]; struct cookie_macs m; }; struct wg_pkt_cookie { uint32_t t; uint32_t r_idx; uint8_t nonce[COOKIE_NONCE_SIZE]; uint8_t ec[COOKIE_ENCRYPTED_SIZE]; }; struct wg_pkt_data { uint32_t t; uint32_t r_idx; uint64_t nonce; uint8_t buf[]; }; struct wg_endpoint { union { struct sockaddr r_sa; struct sockaddr_in r_sin; #ifdef INET6 struct sockaddr_in6 r_sin6; #endif } e_remote; union { struct in_addr l_in; #ifdef INET6 struct in6_pktinfo l_pktinfo6; #define l_in6 l_pktinfo6.ipi6_addr #endif } e_local; }; struct aip_addr { uint8_t length; union { uint8_t bytes[16]; uint32_t ip; uint32_t ip6[4]; struct in_addr in; struct in6_addr in6; }; }; struct wg_aip { struct radix_node a_nodes[2]; LIST_ENTRY(wg_aip) a_entry; struct aip_addr a_addr; struct aip_addr a_mask; struct wg_peer *a_peer; sa_family_t a_af; }; struct wg_packet { STAILQ_ENTRY(wg_packet) p_serial; STAILQ_ENTRY(wg_packet) p_parallel; struct wg_endpoint p_endpoint; struct noise_keypair *p_keypair; uint64_t p_nonce; struct mbuf *p_mbuf; int p_mtu; sa_family_t p_af; enum wg_ring_state { WG_PACKET_UNCRYPTED, WG_PACKET_CRYPTED, WG_PACKET_DEAD, } p_state; }; STAILQ_HEAD(wg_packet_list, wg_packet); struct wg_queue { struct mtx q_mtx; struct wg_packet_list q_queue; size_t q_len; }; struct wg_peer { TAILQ_ENTRY(wg_peer) p_entry; uint64_t p_id; struct wg_softc *p_sc; struct noise_remote *p_remote; struct cookie_maker p_cookie; struct rwlock p_endpoint_lock; struct wg_endpoint p_endpoint; struct wg_queue p_stage_queue; struct wg_queue p_encrypt_serial; struct wg_queue p_decrypt_serial; bool p_enabled; bool p_need_another_keepalive; uint16_t p_persistent_keepalive_interval; struct callout p_new_handshake; struct callout p_send_keepalive; struct callout p_retry_handshake; struct callout p_zero_key_material; struct callout p_persistent_keepalive; struct mtx p_handshake_mtx; struct timespec p_handshake_complete; /* nanotime */ int p_handshake_retries; struct grouptask p_send; struct grouptask p_recv; counter_u64_t p_tx_bytes; counter_u64_t p_rx_bytes; LIST_HEAD(, wg_aip) p_aips; size_t p_aips_num; }; struct wg_socket { struct socket *so_so4; struct socket *so_so6; uint32_t so_user_cookie; int so_fibnum; in_port_t so_port; }; struct wg_softc { LIST_ENTRY(wg_softc) sc_entry; if_t sc_ifp; int sc_flags; struct ucred *sc_ucred; struct wg_socket sc_socket; TAILQ_HEAD(,wg_peer) sc_peers; size_t sc_peers_num; struct noise_local *sc_local; struct cookie_checker sc_cookie; struct radix_node_head *sc_aip4; struct radix_node_head *sc_aip6; struct grouptask sc_handshake; struct wg_queue sc_handshake_queue; struct grouptask *sc_encrypt; struct grouptask *sc_decrypt; struct wg_queue sc_encrypt_parallel; struct wg_queue sc_decrypt_parallel; u_int sc_encrypt_last_cpu; u_int sc_decrypt_last_cpu; struct sx sc_lock; }; #define WGF_DYING 0x0001 #define MAX_LOOPS 8 #define MTAG_WGLOOP 0x77676c70 /* wglp */ #define 
GROUPTASK_DRAIN(gtask) \ gtaskqueue_drain((gtask)->gt_taskqueue, &(gtask)->gt_task) #define BPF_MTAP2_AF(ifp, m, af) do { \ uint32_t __bpf_tap_af = (af); \ BPF_MTAP2(ifp, &__bpf_tap_af, sizeof(__bpf_tap_af), m); \ } while (0) static int clone_count; static uma_zone_t wg_packet_zone; static volatile unsigned long peer_counter = 0; static const char wgname[] = "wg"; static unsigned wg_osd_jail_slot; static struct sx wg_sx; SX_SYSINIT(wg_sx, &wg_sx, "wg_sx"); static LIST_HEAD(, wg_softc) wg_list = LIST_HEAD_INITIALIZER(wg_list); static TASKQGROUP_DEFINE(wg_tqg, mp_ncpus, 1); MALLOC_DEFINE(M_WG, "WG", "wireguard"); VNET_DEFINE_STATIC(struct if_clone *, wg_cloner); #define V_wg_cloner VNET(wg_cloner) #define WG_CAPS IFCAP_LINKSTATE struct wg_timespec64 { uint64_t tv_sec; uint64_t tv_nsec; }; static int wg_socket_init(struct wg_softc *, in_port_t); static int wg_socket_bind(struct socket **, struct socket **, in_port_t *); static void wg_socket_set(struct wg_softc *, struct socket *, struct socket *); static void wg_socket_uninit(struct wg_softc *); static int wg_socket_set_sockopt(struct socket *, struct socket *, int, void *, size_t); static int wg_socket_set_cookie(struct wg_softc *, uint32_t); static int wg_socket_set_fibnum(struct wg_softc *, int); static int wg_send(struct wg_softc *, struct wg_endpoint *, struct mbuf *); static void wg_timers_enable(struct wg_peer *); static void wg_timers_disable(struct wg_peer *); static void wg_timers_set_persistent_keepalive(struct wg_peer *, uint16_t); static void wg_timers_get_last_handshake(struct wg_peer *, struct wg_timespec64 *); static void wg_timers_event_data_sent(struct wg_peer *); static void wg_timers_event_data_received(struct wg_peer *); static void wg_timers_event_any_authenticated_packet_sent(struct wg_peer *); static void wg_timers_event_any_authenticated_packet_received(struct wg_peer *); static void wg_timers_event_any_authenticated_packet_traversal(struct wg_peer *); static void wg_timers_event_handshake_initiated(struct wg_peer *); static void wg_timers_event_handshake_complete(struct wg_peer *); static void wg_timers_event_session_derived(struct wg_peer *); static void wg_timers_event_want_initiation(struct wg_peer *); static void wg_timers_run_send_initiation(struct wg_peer *, bool); static void wg_timers_run_retry_handshake(void *); static void wg_timers_run_send_keepalive(void *); static void wg_timers_run_new_handshake(void *); static void wg_timers_run_zero_key_material(void *); static void wg_timers_run_persistent_keepalive(void *); static int wg_aip_add(struct wg_softc *, struct wg_peer *, sa_family_t, const void *, uint8_t); static struct wg_peer *wg_aip_lookup(struct wg_softc *, sa_family_t, void *); static void wg_aip_remove_all(struct wg_softc *, struct wg_peer *); static struct wg_peer *wg_peer_alloc(struct wg_softc *, const uint8_t [WG_KEY_SIZE]); static void wg_peer_free_deferred(struct noise_remote *); static void wg_peer_destroy(struct wg_peer *); static void wg_peer_destroy_all(struct wg_softc *); static void wg_peer_send_buf(struct wg_peer *, uint8_t *, size_t); static void wg_send_initiation(struct wg_peer *); static void wg_send_response(struct wg_peer *); static void wg_send_cookie(struct wg_softc *, struct cookie_macs *, uint32_t, struct wg_endpoint *); static void wg_peer_set_endpoint(struct wg_peer *, struct wg_endpoint *); static void wg_peer_clear_src(struct wg_peer *); static void wg_peer_get_endpoint(struct wg_peer *, struct wg_endpoint *); static void wg_send_buf(struct wg_softc *, struct wg_endpoint 
*, uint8_t *, size_t); static void wg_send_keepalive(struct wg_peer *); static void wg_handshake(struct wg_softc *, struct wg_packet *); static void wg_encrypt(struct wg_softc *, struct wg_packet *); static void wg_decrypt(struct wg_softc *, struct wg_packet *); static void wg_softc_handshake_receive(struct wg_softc *); static void wg_softc_decrypt(struct wg_softc *); static void wg_softc_encrypt(struct wg_softc *); static void wg_encrypt_dispatch(struct wg_softc *); static void wg_decrypt_dispatch(struct wg_softc *); static void wg_deliver_out(struct wg_peer *); static void wg_deliver_in(struct wg_peer *); static struct wg_packet *wg_packet_alloc(struct mbuf *); static void wg_packet_free(struct wg_packet *); static void wg_queue_init(struct wg_queue *, const char *); static void wg_queue_deinit(struct wg_queue *); static size_t wg_queue_len(struct wg_queue *); static int wg_queue_enqueue_handshake(struct wg_queue *, struct wg_packet *); static struct wg_packet *wg_queue_dequeue_handshake(struct wg_queue *); static void wg_queue_push_staged(struct wg_queue *, struct wg_packet *); static void wg_queue_enlist_staged(struct wg_queue *, struct wg_packet_list *); static void wg_queue_delist_staged(struct wg_queue *, struct wg_packet_list *); static void wg_queue_purge(struct wg_queue *); static int wg_queue_both(struct wg_queue *, struct wg_queue *, struct wg_packet *); static struct wg_packet *wg_queue_dequeue_serial(struct wg_queue *); static struct wg_packet *wg_queue_dequeue_parallel(struct wg_queue *); static bool wg_input(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); static void wg_peer_send_staged(struct wg_peer *); static int wg_clone_create(struct if_clone *ifc, char *name, size_t len, struct ifc_data *ifd, if_t *ifpp); static void wg_qflush(if_t); static inline int determine_af_and_pullup(struct mbuf **m, sa_family_t *af); static int wg_xmit(if_t, struct mbuf *, sa_family_t, uint32_t); static int wg_transmit(if_t, struct mbuf *); static int wg_output(if_t, struct mbuf *, const struct sockaddr *, struct route *); static int wg_clone_destroy(struct if_clone *ifc, if_t ifp, uint32_t flags); static bool wgc_privileged(struct wg_softc *); static int wgc_get(struct wg_softc *, struct wg_data_io *); static int wgc_set(struct wg_softc *, struct wg_data_io *); static int wg_up(struct wg_softc *); static void wg_down(struct wg_softc *); static void wg_reassign(if_t, struct vnet *, char *unused); static void wg_init(void *); static int wg_ioctl(if_t, u_long, caddr_t); static void vnet_wg_init(const void *); static void vnet_wg_uninit(const void *); static int wg_module_init(void); static void wg_module_deinit(void); /* TODO Peer */ static struct wg_peer * wg_peer_alloc(struct wg_softc *sc, const uint8_t pub_key[WG_KEY_SIZE]) { struct wg_peer *peer; sx_assert(&sc->sc_lock, SX_XLOCKED); peer = malloc(sizeof(*peer), M_WG, M_WAITOK | M_ZERO); peer->p_remote = noise_remote_alloc(sc->sc_local, peer, pub_key); peer->p_tx_bytes = counter_u64_alloc(M_WAITOK); peer->p_rx_bytes = counter_u64_alloc(M_WAITOK); peer->p_id = peer_counter++; peer->p_sc = sc; cookie_maker_init(&peer->p_cookie, pub_key); rw_init(&peer->p_endpoint_lock, "wg_peer_endpoint"); wg_queue_init(&peer->p_stage_queue, "stageq"); wg_queue_init(&peer->p_encrypt_serial, "txq"); wg_queue_init(&peer->p_decrypt_serial, "rxq"); peer->p_enabled = false; peer->p_need_another_keepalive = false; peer->p_persistent_keepalive_interval = 0; callout_init(&peer->p_new_handshake, true); callout_init(&peer->p_send_keepalive, true); 
callout_init(&peer->p_retry_handshake, true); callout_init(&peer->p_persistent_keepalive, true); callout_init(&peer->p_zero_key_material, true); mtx_init(&peer->p_handshake_mtx, "peer handshake", NULL, MTX_DEF); bzero(&peer->p_handshake_complete, sizeof(peer->p_handshake_complete)); peer->p_handshake_retries = 0; GROUPTASK_INIT(&peer->p_send, 0, (gtask_fn_t *)wg_deliver_out, peer); taskqgroup_attach(qgroup_wg_tqg, &peer->p_send, peer, NULL, NULL, "wg send"); GROUPTASK_INIT(&peer->p_recv, 0, (gtask_fn_t *)wg_deliver_in, peer); taskqgroup_attach(qgroup_wg_tqg, &peer->p_recv, peer, NULL, NULL, "wg recv"); LIST_INIT(&peer->p_aips); peer->p_aips_num = 0; return (peer); } static void wg_peer_free_deferred(struct noise_remote *r) { struct wg_peer *peer = noise_remote_arg(r); /* While there are no references remaining, we may still have * p_{send,recv} executing (think empty queue, but wg_deliver_{in,out} * needs to check the queue. We should wait for them and then free. */ GROUPTASK_DRAIN(&peer->p_recv); GROUPTASK_DRAIN(&peer->p_send); taskqgroup_detach(qgroup_wg_tqg, &peer->p_recv); taskqgroup_detach(qgroup_wg_tqg, &peer->p_send); wg_queue_deinit(&peer->p_decrypt_serial); wg_queue_deinit(&peer->p_encrypt_serial); wg_queue_deinit(&peer->p_stage_queue); counter_u64_free(peer->p_tx_bytes); counter_u64_free(peer->p_rx_bytes); rw_destroy(&peer->p_endpoint_lock); mtx_destroy(&peer->p_handshake_mtx); cookie_maker_free(&peer->p_cookie); free(peer, M_WG); } static void wg_peer_destroy(struct wg_peer *peer) { struct wg_softc *sc = peer->p_sc; sx_assert(&sc->sc_lock, SX_XLOCKED); /* Disable remote and timers. This will prevent any new handshakes * occuring. */ noise_remote_disable(peer->p_remote); wg_timers_disable(peer); /* Now we can remove all allowed IPs so no more packets will be routed * to the peer. */ wg_aip_remove_all(sc, peer); /* Remove peer from the interface, then free. Some references may still * exist to p_remote, so noise_remote_free will wait until they're all * put to call wg_peer_free_deferred. 
*/ sc->sc_peers_num--; TAILQ_REMOVE(&sc->sc_peers, peer, p_entry); DPRINTF(sc, "Peer %" PRIu64 " destroyed\n", peer->p_id); noise_remote_free(peer->p_remote, wg_peer_free_deferred); } static void wg_peer_destroy_all(struct wg_softc *sc) { struct wg_peer *peer, *tpeer; TAILQ_FOREACH_SAFE(peer, &sc->sc_peers, p_entry, tpeer) wg_peer_destroy(peer); } static void wg_peer_set_endpoint(struct wg_peer *peer, struct wg_endpoint *e) { MPASS(e->e_remote.r_sa.sa_family != 0); if (memcmp(e, &peer->p_endpoint, sizeof(*e)) == 0) return; rw_wlock(&peer->p_endpoint_lock); peer->p_endpoint = *e; rw_wunlock(&peer->p_endpoint_lock); } static void wg_peer_clear_src(struct wg_peer *peer) { rw_wlock(&peer->p_endpoint_lock); bzero(&peer->p_endpoint.e_local, sizeof(peer->p_endpoint.e_local)); rw_wunlock(&peer->p_endpoint_lock); } static void wg_peer_get_endpoint(struct wg_peer *peer, struct wg_endpoint *e) { rw_rlock(&peer->p_endpoint_lock); *e = peer->p_endpoint; rw_runlock(&peer->p_endpoint_lock); } /* Allowed IP */ static int wg_aip_add(struct wg_softc *sc, struct wg_peer *peer, sa_family_t af, const void *addr, uint8_t cidr) { struct radix_node_head *root; struct radix_node *node; struct wg_aip *aip; int ret = 0; aip = malloc(sizeof(*aip), M_WG, M_WAITOK | M_ZERO); aip->a_peer = peer; aip->a_af = af; switch (af) { #ifdef INET case AF_INET: if (cidr > 32) cidr = 32; root = sc->sc_aip4; aip->a_addr.in = *(const struct in_addr *)addr; aip->a_mask.ip = htonl(~((1LL << (32 - cidr)) - 1) & 0xffffffff); aip->a_addr.ip &= aip->a_mask.ip; aip->a_addr.length = aip->a_mask.length = offsetof(struct aip_addr, in) + sizeof(struct in_addr); break; #endif #ifdef INET6 case AF_INET6: if (cidr > 128) cidr = 128; root = sc->sc_aip6; aip->a_addr.in6 = *(const struct in6_addr *)addr; in6_prefixlen2mask(&aip->a_mask.in6, cidr); for (int i = 0; i < 4; i++) aip->a_addr.ip6[i] &= aip->a_mask.ip6[i]; aip->a_addr.length = aip->a_mask.length = offsetof(struct aip_addr, in6) + sizeof(struct in6_addr); break; #endif default: free(aip, M_WG); return (EAFNOSUPPORT); } RADIX_NODE_HEAD_LOCK(root); node = root->rnh_addaddr(&aip->a_addr, &aip->a_mask, &root->rh, aip->a_nodes); if (node == aip->a_nodes) { LIST_INSERT_HEAD(&peer->p_aips, aip, a_entry); peer->p_aips_num++; } else if (!node) node = root->rnh_lookup(&aip->a_addr, &aip->a_mask, &root->rh); if (!node) { free(aip, M_WG); ret = ENOMEM; } else if (node != aip->a_nodes) { free(aip, M_WG); aip = (struct wg_aip *)node; if (aip->a_peer != peer) { LIST_REMOVE(aip, a_entry); aip->a_peer->p_aips_num--; aip->a_peer = peer; LIST_INSERT_HEAD(&peer->p_aips, aip, a_entry); aip->a_peer->p_aips_num++; } } RADIX_NODE_HEAD_UNLOCK(root); return (ret); } static struct wg_peer * wg_aip_lookup(struct wg_softc *sc, sa_family_t af, void *a) { struct radix_node_head *root; struct radix_node *node; struct wg_peer *peer; struct aip_addr addr; RADIX_NODE_HEAD_RLOCK_TRACKER; switch (af) { case AF_INET: root = sc->sc_aip4; memcpy(&addr.in, a, sizeof(addr.in)); addr.length = offsetof(struct aip_addr, in) + sizeof(struct in_addr); break; case AF_INET6: root = sc->sc_aip6; memcpy(&addr.in6, a, sizeof(addr.in6)); addr.length = offsetof(struct aip_addr, in6) + sizeof(struct in6_addr); break; default: return NULL; } RADIX_NODE_HEAD_RLOCK(root); node = root->rnh_matchaddr(&addr, &root->rh); if (node != NULL) { peer = ((struct wg_aip *)node)->a_peer; noise_remote_ref(peer->p_remote); } else { peer = NULL; } RADIX_NODE_HEAD_RUNLOCK(root); return (peer); } static void wg_aip_remove_all(struct wg_softc *sc, struct wg_peer 
*peer) { struct wg_aip *aip, *taip; RADIX_NODE_HEAD_LOCK(sc->sc_aip4); LIST_FOREACH_SAFE(aip, &peer->p_aips, a_entry, taip) { if (aip->a_af == AF_INET) { if (sc->sc_aip4->rnh_deladdr(&aip->a_addr, &aip->a_mask, &sc->sc_aip4->rh) == NULL) panic("failed to delete aip %p", aip); LIST_REMOVE(aip, a_entry); peer->p_aips_num--; free(aip, M_WG); } } RADIX_NODE_HEAD_UNLOCK(sc->sc_aip4); RADIX_NODE_HEAD_LOCK(sc->sc_aip6); LIST_FOREACH_SAFE(aip, &peer->p_aips, a_entry, taip) { if (aip->a_af == AF_INET6) { if (sc->sc_aip6->rnh_deladdr(&aip->a_addr, &aip->a_mask, &sc->sc_aip6->rh) == NULL) panic("failed to delete aip %p", aip); LIST_REMOVE(aip, a_entry); peer->p_aips_num--; free(aip, M_WG); } } RADIX_NODE_HEAD_UNLOCK(sc->sc_aip6); if (!LIST_EMPTY(&peer->p_aips) || peer->p_aips_num != 0) panic("wg_aip_remove_all could not delete all %p", peer); } static int wg_socket_init(struct wg_softc *sc, in_port_t port) { struct ucred *cred = sc->sc_ucred; struct socket *so4 = NULL, *so6 = NULL; int rc; sx_assert(&sc->sc_lock, SX_XLOCKED); if (!cred) return (EBUSY); /* * For socket creation, we use the creds of the thread that created the * tunnel rather than the current thread to maintain the semantics that * WireGuard has on Linux with network namespaces -- that the sockets * are created in their home vnet so that they can be configured and * functionally attached to a foreign vnet as the jail's only interface * to the network. */ #ifdef INET rc = socreate(AF_INET, &so4, SOCK_DGRAM, IPPROTO_UDP, cred, curthread); if (rc) goto out; rc = udp_set_kernel_tunneling(so4, wg_input, NULL, sc); /* * udp_set_kernel_tunneling can only fail if there is already a tunneling function set. * This should never happen with a new socket. */ MPASS(rc == 0); #endif #ifdef INET6 rc = socreate(AF_INET6, &so6, SOCK_DGRAM, IPPROTO_UDP, cred, curthread); if (rc) goto out; rc = udp_set_kernel_tunneling(so6, wg_input, NULL, sc); MPASS(rc == 0); #endif if (sc->sc_socket.so_user_cookie) { rc = wg_socket_set_sockopt(so4, so6, SO_USER_COOKIE, &sc->sc_socket.so_user_cookie, sizeof(sc->sc_socket.so_user_cookie)); if (rc) goto out; } rc = wg_socket_set_sockopt(so4, so6, SO_SETFIB, &sc->sc_socket.so_fibnum, sizeof(sc->sc_socket.so_fibnum)); if (rc) goto out; rc = wg_socket_bind(&so4, &so6, &port); if (!rc) { sc->sc_socket.so_port = port; wg_socket_set(sc, so4, so6); } out: if (rc) { if (so4 != NULL) soclose(so4); if (so6 != NULL) soclose(so6); } return (rc); } static int wg_socket_set_sockopt(struct socket *so4, struct socket *so6, int name, void *val, size_t len) { int ret4 = 0, ret6 = 0; struct sockopt sopt = { .sopt_dir = SOPT_SET, .sopt_level = SOL_SOCKET, .sopt_name = name, .sopt_val = val, .sopt_valsize = len }; if (so4) ret4 = sosetopt(so4, &sopt); if (so6) ret6 = sosetopt(so6, &sopt); return (ret4 ?: ret6); } static int wg_socket_set_cookie(struct wg_softc *sc, uint32_t user_cookie) { struct wg_socket *so = &sc->sc_socket; int ret; sx_assert(&sc->sc_lock, SX_XLOCKED); ret = wg_socket_set_sockopt(so->so_so4, so->so_so6, SO_USER_COOKIE, &user_cookie, sizeof(user_cookie)); if (!ret) so->so_user_cookie = user_cookie; return (ret); } static int wg_socket_set_fibnum(struct wg_softc *sc, int fibnum) { struct wg_socket *so = &sc->sc_socket; int ret; sx_assert(&sc->sc_lock, SX_XLOCKED); ret = wg_socket_set_sockopt(so->so_so4, so->so_so6, SO_SETFIB, &fibnum, sizeof(fibnum)); if (!ret) so->so_fibnum = fibnum; return (ret); } static void wg_socket_uninit(struct wg_softc *sc) { wg_socket_set(sc, NULL, NULL); } static void wg_socket_set(struct wg_softc 
*sc, struct socket *new_so4, struct socket *new_so6) { struct wg_socket *so = &sc->sc_socket; struct socket *so4, *so6; sx_assert(&sc->sc_lock, SX_XLOCKED); so4 = atomic_load_ptr(&so->so_so4); so6 = atomic_load_ptr(&so->so_so6); atomic_store_ptr(&so->so_so4, new_so4); atomic_store_ptr(&so->so_so6, new_so6); if (!so4 && !so6) return; NET_EPOCH_WAIT(); if (so4) soclose(so4); if (so6) soclose(so6); } static int wg_socket_bind(struct socket **in_so4, struct socket **in_so6, in_port_t *requested_port) { struct socket *so4 = *in_so4, *so6 = *in_so6; int ret4 = 0, ret6 = 0; in_port_t port = *requested_port; struct sockaddr_in sin = { .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_port = htons(port) }; struct sockaddr_in6 sin6 = { .sin6_len = sizeof(struct sockaddr_in6), .sin6_family = AF_INET6, .sin6_port = htons(port) }; if (so4) { ret4 = sobind(so4, (struct sockaddr *)&sin, curthread); if (ret4 && ret4 != EADDRNOTAVAIL) return (ret4); if (!ret4 && !sin.sin_port) { struct sockaddr_in *bound_sin; int ret = so4->so_proto->pr_sockaddr(so4, (struct sockaddr **)&bound_sin); if (ret) return (ret); port = ntohs(bound_sin->sin_port); sin6.sin6_port = bound_sin->sin_port; free(bound_sin, M_SONAME); } } if (so6) { ret6 = sobind(so6, (struct sockaddr *)&sin6, curthread); if (ret6 && ret6 != EADDRNOTAVAIL) return (ret6); if (!ret6 && !sin6.sin6_port) { struct sockaddr_in6 *bound_sin6; int ret = so6->so_proto->pr_sockaddr(so6, (struct sockaddr **)&bound_sin6); if (ret) return (ret); port = ntohs(bound_sin6->sin6_port); free(bound_sin6, M_SONAME); } } if (ret4 && ret6) return (ret4); *requested_port = port; if (ret4 && !ret6 && so4) { soclose(so4); *in_so4 = NULL; } else if (ret6 && !ret4 && so6) { soclose(so6); *in_so6 = NULL; } return (0); } static int wg_send(struct wg_softc *sc, struct wg_endpoint *e, struct mbuf *m) { struct epoch_tracker et; struct sockaddr *sa; struct wg_socket *so = &sc->sc_socket; struct socket *so4, *so6; struct mbuf *control = NULL; int ret = 0; size_t len = m->m_pkthdr.len; /* Get local control address before locking */ if (e->e_remote.r_sa.sa_family == AF_INET) { if (e->e_local.l_in.s_addr != INADDR_ANY) control = sbcreatecontrol((caddr_t)&e->e_local.l_in, sizeof(struct in_addr), IP_SENDSRCADDR, IPPROTO_IP, M_NOWAIT); #ifdef INET6 } else if (e->e_remote.r_sa.sa_family == AF_INET6) { if (!IN6_IS_ADDR_UNSPECIFIED(&e->e_local.l_in6)) control = sbcreatecontrol((caddr_t)&e->e_local.l_pktinfo6, sizeof(struct in6_pktinfo), IPV6_PKTINFO, IPPROTO_IPV6, M_NOWAIT); #endif } else { m_freem(m); return (EAFNOSUPPORT); } /* Get remote address */ sa = &e->e_remote.r_sa; NET_EPOCH_ENTER(et); so4 = atomic_load_ptr(&so->so_so4); so6 = atomic_load_ptr(&so->so_so6); if (e->e_remote.r_sa.sa_family == AF_INET && so4 != NULL) ret = sosend(so4, sa, NULL, m, control, 0, curthread); else if (e->e_remote.r_sa.sa_family == AF_INET6 && so6 != NULL) ret = sosend(so6, sa, NULL, m, control, 0, curthread); else { ret = ENOTCONN; m_freem(control); m_freem(m); } NET_EPOCH_EXIT(et); if (ret == 0) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len); } return (ret); } static void wg_send_buf(struct wg_softc *sc, struct wg_endpoint *e, uint8_t *buf, size_t len) { struct mbuf *m; int ret = 0; bool retried = false; retry: m = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); if (!m) { ret = ENOMEM; goto out; } m_copyback(m, 0, len, buf); if (ret == 0) { ret = wg_send(sc, e, m); /* Retry if we couldn't bind to e->e_local */ if (ret == EADDRNOTAVAIL && 
!retried) { bzero(&e->e_local, sizeof(e->e_local)); retried = true; goto retry; } } else { ret = wg_send(sc, e, m); } out: if (ret) DPRINTF(sc, "Unable to send packet: %d\n", ret); } /* Timers */ static void wg_timers_enable(struct wg_peer *peer) { atomic_store_bool(&peer->p_enabled, true); wg_timers_run_persistent_keepalive(peer); } static void wg_timers_disable(struct wg_peer *peer) { /* By setting p_enabled = false, then calling NET_EPOCH_WAIT, we can be * sure no new handshakes are created after the wait. This is because * all callout_resets (scheduling the callout) are guarded by * p_enabled. We can be sure all sections that read p_enabled and then * optionally call callout_reset are finished as they are surrounded by * NET_EPOCH_{ENTER,EXIT}. * * However, as new callouts may be scheduled during NET_EPOCH_WAIT (but * not after), we stop all callouts leaving no callouts active. * * We should also pull NET_EPOCH_WAIT out of the FOREACH(peer) loops, but the * performance impact is acceptable for the time being. */ atomic_store_bool(&peer->p_enabled, false); NET_EPOCH_WAIT(); atomic_store_bool(&peer->p_need_another_keepalive, false); callout_stop(&peer->p_new_handshake); callout_stop(&peer->p_send_keepalive); callout_stop(&peer->p_retry_handshake); callout_stop(&peer->p_persistent_keepalive); callout_stop(&peer->p_zero_key_material); } static void wg_timers_set_persistent_keepalive(struct wg_peer *peer, uint16_t interval) { struct epoch_tracker et; if (interval != peer->p_persistent_keepalive_interval) { atomic_store_16(&peer->p_persistent_keepalive_interval, interval); NET_EPOCH_ENTER(et); if (atomic_load_bool(&peer->p_enabled)) wg_timers_run_persistent_keepalive(peer); NET_EPOCH_EXIT(et); } } static void wg_timers_get_last_handshake(struct wg_peer *peer, struct wg_timespec64 *time) { mtx_lock(&peer->p_handshake_mtx); time->tv_sec = peer->p_handshake_complete.tv_sec; time->tv_nsec = peer->p_handshake_complete.tv_nsec; mtx_unlock(&peer->p_handshake_mtx); } static void wg_timers_event_data_sent(struct wg_peer *peer) { struct epoch_tracker et; NET_EPOCH_ENTER(et); if (atomic_load_bool(&peer->p_enabled) && !callout_pending(&peer->p_new_handshake)) callout_reset(&peer->p_new_handshake, MSEC_2_TICKS( NEW_HANDSHAKE_TIMEOUT * 1000 + arc4random_uniform(REKEY_TIMEOUT_JITTER)), wg_timers_run_new_handshake, peer); NET_EPOCH_EXIT(et); } static void wg_timers_event_data_received(struct wg_peer *peer) { struct epoch_tracker et; NET_EPOCH_ENTER(et); if (atomic_load_bool(&peer->p_enabled)) { if (!callout_pending(&peer->p_send_keepalive)) callout_reset(&peer->p_send_keepalive, MSEC_2_TICKS(KEEPALIVE_TIMEOUT * 1000), wg_timers_run_send_keepalive, peer); else atomic_store_bool(&peer->p_need_another_keepalive, true); } NET_EPOCH_EXIT(et); } static void wg_timers_event_any_authenticated_packet_sent(struct wg_peer *peer) { callout_stop(&peer->p_send_keepalive); } static void wg_timers_event_any_authenticated_packet_received(struct wg_peer *peer) { callout_stop(&peer->p_new_handshake); } static void wg_timers_event_any_authenticated_packet_traversal(struct wg_peer *peer) { struct epoch_tracker et; uint16_t interval; NET_EPOCH_ENTER(et); interval = atomic_load_16(&peer->p_persistent_keepalive_interval); if (atomic_load_bool(&peer->p_enabled) && interval > 0) callout_reset(&peer->p_persistent_keepalive, MSEC_2_TICKS(interval * 1000), wg_timers_run_persistent_keepalive, peer); NET_EPOCH_EXIT(et); } static void wg_timers_event_handshake_initiated(struct wg_peer *peer) { struct epoch_tracker et; NET_EPOCH_ENTER(et); 
if (atomic_load_bool(&peer->p_enabled)) callout_reset(&peer->p_retry_handshake, MSEC_2_TICKS( REKEY_TIMEOUT * 1000 + arc4random_uniform(REKEY_TIMEOUT_JITTER)), wg_timers_run_retry_handshake, peer); NET_EPOCH_EXIT(et); } static void wg_timers_event_handshake_complete(struct wg_peer *peer) { struct epoch_tracker et; NET_EPOCH_ENTER(et); if (atomic_load_bool(&peer->p_enabled)) { mtx_lock(&peer->p_handshake_mtx); callout_stop(&peer->p_retry_handshake); peer->p_handshake_retries = 0; getnanotime(&peer->p_handshake_complete); mtx_unlock(&peer->p_handshake_mtx); wg_timers_run_send_keepalive(peer); } NET_EPOCH_EXIT(et); } static void wg_timers_event_session_derived(struct wg_peer *peer) { struct epoch_tracker et; NET_EPOCH_ENTER(et); if (atomic_load_bool(&peer->p_enabled)) callout_reset(&peer->p_zero_key_material, MSEC_2_TICKS(REJECT_AFTER_TIME * 3 * 1000), wg_timers_run_zero_key_material, peer); NET_EPOCH_EXIT(et); } static void wg_timers_event_want_initiation(struct wg_peer *peer) { struct epoch_tracker et; NET_EPOCH_ENTER(et); if (atomic_load_bool(&peer->p_enabled)) wg_timers_run_send_initiation(peer, false); NET_EPOCH_EXIT(et); } static void wg_timers_run_send_initiation(struct wg_peer *peer, bool is_retry) { if (!is_retry) peer->p_handshake_retries = 0; if (noise_remote_initiation_expired(peer->p_remote) == ETIMEDOUT) wg_send_initiation(peer); } static void wg_timers_run_retry_handshake(void *_peer) { struct epoch_tracker et; struct wg_peer *peer = _peer; mtx_lock(&peer->p_handshake_mtx); if (peer->p_handshake_retries <= MAX_TIMER_HANDSHAKES) { peer->p_handshake_retries++; mtx_unlock(&peer->p_handshake_mtx); DPRINTF(peer->p_sc, "Handshake for peer %" PRIu64 " did not complete " "after %d seconds, retrying (try %d)\n", peer->p_id, REKEY_TIMEOUT, peer->p_handshake_retries + 1); wg_peer_clear_src(peer); wg_timers_run_send_initiation(peer, true); } else { mtx_unlock(&peer->p_handshake_mtx); DPRINTF(peer->p_sc, "Handshake for peer %" PRIu64 " did not complete " "after %d retries, giving up\n", peer->p_id, MAX_TIMER_HANDSHAKES + 2); callout_stop(&peer->p_send_keepalive); wg_queue_purge(&peer->p_stage_queue); NET_EPOCH_ENTER(et); if (atomic_load_bool(&peer->p_enabled) && !callout_pending(&peer->p_zero_key_material)) callout_reset(&peer->p_zero_key_material, MSEC_2_TICKS(REJECT_AFTER_TIME * 3 * 1000), wg_timers_run_zero_key_material, peer); NET_EPOCH_EXIT(et); } } static void wg_timers_run_send_keepalive(void *_peer) { struct epoch_tracker et; struct wg_peer *peer = _peer; wg_send_keepalive(peer); NET_EPOCH_ENTER(et); if (atomic_load_bool(&peer->p_enabled) && atomic_load_bool(&peer->p_need_another_keepalive)) { atomic_store_bool(&peer->p_need_another_keepalive, false); callout_reset(&peer->p_send_keepalive, MSEC_2_TICKS(KEEPALIVE_TIMEOUT * 1000), wg_timers_run_send_keepalive, peer); } NET_EPOCH_EXIT(et); } static void wg_timers_run_new_handshake(void *_peer) { struct wg_peer *peer = _peer; DPRINTF(peer->p_sc, "Retrying handshake with peer %" PRIu64 " because we " "stopped hearing back after %d seconds\n", peer->p_id, NEW_HANDSHAKE_TIMEOUT); wg_peer_clear_src(peer); wg_timers_run_send_initiation(peer, false); } static void wg_timers_run_zero_key_material(void *_peer) { struct wg_peer *peer = _peer; DPRINTF(peer->p_sc, "Zeroing out keys for peer %" PRIu64 ", since we " "haven't received a new one in %d seconds\n", peer->p_id, REJECT_AFTER_TIME * 3); noise_remote_keypairs_clear(peer->p_remote); } static void wg_timers_run_persistent_keepalive(void *_peer) { struct wg_peer *peer = _peer; if 
(atomic_load_16(&peer->p_persistent_keepalive_interval) > 0) wg_send_keepalive(peer); } /* TODO Handshake */ static void wg_peer_send_buf(struct wg_peer *peer, uint8_t *buf, size_t len) { struct wg_endpoint endpoint; counter_u64_add(peer->p_tx_bytes, len); wg_timers_event_any_authenticated_packet_traversal(peer); wg_timers_event_any_authenticated_packet_sent(peer); wg_peer_get_endpoint(peer, &endpoint); wg_send_buf(peer->p_sc, &endpoint, buf, len); } static void wg_send_initiation(struct wg_peer *peer) { struct wg_pkt_initiation pkt; if (noise_create_initiation(peer->p_remote, &pkt.s_idx, pkt.ue, pkt.es, pkt.ets) != 0) return; DPRINTF(peer->p_sc, "Sending handshake initiation to peer %" PRIu64 "\n", peer->p_id); pkt.t = WG_PKT_INITIATION; cookie_maker_mac(&peer->p_cookie, &pkt.m, &pkt, sizeof(pkt) - sizeof(pkt.m)); wg_peer_send_buf(peer, (uint8_t *)&pkt, sizeof(pkt)); wg_timers_event_handshake_initiated(peer); } static void wg_send_response(struct wg_peer *peer) { struct wg_pkt_response pkt; if (noise_create_response(peer->p_remote, &pkt.s_idx, &pkt.r_idx, pkt.ue, pkt.en) != 0) return; DPRINTF(peer->p_sc, "Sending handshake response to peer %" PRIu64 "\n", peer->p_id); wg_timers_event_session_derived(peer); pkt.t = WG_PKT_RESPONSE; cookie_maker_mac(&peer->p_cookie, &pkt.m, &pkt, sizeof(pkt)-sizeof(pkt.m)); wg_peer_send_buf(peer, (uint8_t*)&pkt, sizeof(pkt)); } static void wg_send_cookie(struct wg_softc *sc, struct cookie_macs *cm, uint32_t idx, struct wg_endpoint *e) { struct wg_pkt_cookie pkt; DPRINTF(sc, "Sending cookie response for denied handshake message\n"); pkt.t = WG_PKT_COOKIE; pkt.r_idx = idx; cookie_checker_create_payload(&sc->sc_cookie, cm, pkt.nonce, pkt.ec, &e->e_remote.r_sa); wg_send_buf(sc, e, (uint8_t *)&pkt, sizeof(pkt)); } static void wg_send_keepalive(struct wg_peer *peer) { struct wg_packet *pkt; struct mbuf *m; if (wg_queue_len(&peer->p_stage_queue) > 0) goto send; if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) return; if ((pkt = wg_packet_alloc(m)) == NULL) { m_freem(m); return; } wg_queue_push_staged(&peer->p_stage_queue, pkt); DPRINTF(peer->p_sc, "Sending keepalive packet to peer %" PRIu64 "\n", peer->p_id); send: wg_peer_send_staged(peer); } static void wg_handshake(struct wg_softc *sc, struct wg_packet *pkt) { struct wg_pkt_initiation *init; struct wg_pkt_response *resp; struct wg_pkt_cookie *cook; struct wg_endpoint *e; struct wg_peer *peer; struct mbuf *m; struct noise_remote *remote = NULL; int res; bool underload = false; static sbintime_t wg_last_underload; /* sbinuptime */ underload = wg_queue_len(&sc->sc_handshake_queue) >= MAX_QUEUED_HANDSHAKES / 8; if (underload) { wg_last_underload = getsbinuptime(); } else if (wg_last_underload) { underload = wg_last_underload + UNDERLOAD_TIMEOUT * SBT_1S > getsbinuptime(); if (!underload) wg_last_underload = 0; } m = pkt->p_mbuf; e = &pkt->p_endpoint; if ((pkt->p_mbuf = m = m_pullup(m, m->m_pkthdr.len)) == NULL) goto error; switch (*mtod(m, uint32_t *)) { case WG_PKT_INITIATION: init = mtod(m, struct wg_pkt_initiation *); res = cookie_checker_validate_macs(&sc->sc_cookie, &init->m, init, sizeof(*init) - sizeof(init->m), underload, &e->e_remote.r_sa, if_getvnet(sc->sc_ifp)); if (res == EINVAL) { DPRINTF(sc, "Invalid initiation MAC\n"); goto error; } else if (res == ECONNREFUSED) { DPRINTF(sc, "Handshake ratelimited\n"); goto error; } else if (res == EAGAIN) { wg_send_cookie(sc, &init->m, init->s_idx, e); goto error; } else if (res != 0) { panic("unexpected response: %d\n", res); } if 
(noise_consume_initiation(sc->sc_local, &remote, init->s_idx, init->ue, init->es, init->ets) != 0) { DPRINTF(sc, "Invalid handshake initiation\n"); goto error; } peer = noise_remote_arg(remote); DPRINTF(sc, "Receiving handshake initiation from peer %" PRIu64 "\n", peer->p_id); wg_peer_set_endpoint(peer, e); wg_send_response(peer); break; case WG_PKT_RESPONSE: resp = mtod(m, struct wg_pkt_response *); res = cookie_checker_validate_macs(&sc->sc_cookie, &resp->m, resp, sizeof(*resp) - sizeof(resp->m), underload, &e->e_remote.r_sa, if_getvnet(sc->sc_ifp)); if (res == EINVAL) { DPRINTF(sc, "Invalid response MAC\n"); goto error; } else if (res == ECONNREFUSED) { DPRINTF(sc, "Handshake ratelimited\n"); goto error; } else if (res == EAGAIN) { wg_send_cookie(sc, &resp->m, resp->s_idx, e); goto error; } else if (res != 0) { panic("unexpected response: %d\n", res); } if (noise_consume_response(sc->sc_local, &remote, resp->s_idx, resp->r_idx, resp->ue, resp->en) != 0) { DPRINTF(sc, "Invalid handshake response\n"); goto error; } peer = noise_remote_arg(remote); DPRINTF(sc, "Receiving handshake response from peer %" PRIu64 "\n", peer->p_id); wg_peer_set_endpoint(peer, e); wg_timers_event_session_derived(peer); wg_timers_event_handshake_complete(peer); break; case WG_PKT_COOKIE: cook = mtod(m, struct wg_pkt_cookie *); if ((remote = noise_remote_index(sc->sc_local, cook->r_idx)) == NULL) { DPRINTF(sc, "Unknown cookie index\n"); goto error; } peer = noise_remote_arg(remote); if (cookie_maker_consume_payload(&peer->p_cookie, cook->nonce, cook->ec) == 0) { DPRINTF(sc, "Receiving cookie response\n"); } else { DPRINTF(sc, "Could not decrypt cookie response\n"); goto error; } goto not_authenticated; default: panic("invalid packet in handshake queue"); } wg_timers_event_any_authenticated_packet_received(peer); wg_timers_event_any_authenticated_packet_traversal(peer); not_authenticated: counter_u64_add(peer->p_rx_bytes, m->m_pkthdr.len); if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); error: if (remote != NULL) noise_remote_put(remote); wg_packet_free(pkt); } static void wg_softc_handshake_receive(struct wg_softc *sc) { struct wg_packet *pkt; while ((pkt = wg_queue_dequeue_handshake(&sc->sc_handshake_queue)) != NULL) wg_handshake(sc, pkt); } static void wg_mbuf_reset(struct mbuf *m) { struct m_tag *t, *tmp; /* * We want to reset the mbuf to a newly allocated state, containing * just the packet contents. Unfortunately FreeBSD doesn't seem to * offer this anywhere, so we have to make it up as we go. If we can * get this in kern/kern_mbuf.c, that would be best. * * Notice: this may break things unexpectedly but it is better to fail * closed in the extreme case than leak informtion in every * case. * * With that said, all this attempts to do is remove any extraneous * information that could be present. 
*/ M_ASSERTPKTHDR(m); m->m_flags &= ~(M_BCAST|M_MCAST|M_VLANTAG|M_PROMISC|M_PROTOFLAGS); M_HASHTYPE_CLEAR(m); #ifdef NUMA m->m_pkthdr.numa_domain = M_NODOM; #endif SLIST_FOREACH_SAFE(t, &m->m_pkthdr.tags, m_tag_link, tmp) { if ((t->m_tag_id != 0 || t->m_tag_cookie != MTAG_WGLOOP) && t->m_tag_id != PACKET_TAG_MACLABEL) m_tag_delete(m, t); } KASSERT((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0, ("%s: mbuf %p has a send tag", __func__, m)); m->m_pkthdr.csum_flags = 0; m->m_pkthdr.PH_per.sixtyfour[0] = 0; m->m_pkthdr.PH_loc.sixtyfour[0] = 0; } static inline unsigned int calculate_padding(struct wg_packet *pkt) { unsigned int padded_size, last_unit = pkt->p_mbuf->m_pkthdr.len; /* Keepalive packets don't set p_mtu, but also have a length of zero. */ if (__predict_false(pkt->p_mtu == 0)) { padded_size = (last_unit + (WG_PKT_PADDING - 1)) & ~(WG_PKT_PADDING - 1); return (padded_size - last_unit); } if (__predict_false(last_unit > pkt->p_mtu)) last_unit %= pkt->p_mtu; padded_size = (last_unit + (WG_PKT_PADDING - 1)) & ~(WG_PKT_PADDING - 1); if (pkt->p_mtu < padded_size) padded_size = pkt->p_mtu; return (padded_size - last_unit); } static void wg_encrypt(struct wg_softc *sc, struct wg_packet *pkt) { static const uint8_t padding[WG_PKT_PADDING] = { 0 }; struct wg_pkt_data *data; struct wg_peer *peer; struct noise_remote *remote; struct mbuf *m; uint32_t idx; unsigned int padlen; enum wg_ring_state state = WG_PACKET_DEAD; remote = noise_keypair_remote(pkt->p_keypair); peer = noise_remote_arg(remote); m = pkt->p_mbuf; /* Pad the packet */ padlen = calculate_padding(pkt); if (padlen != 0 && !m_append(m, padlen, padding)) goto out; /* Do encryption */ if (noise_keypair_encrypt(pkt->p_keypair, &idx, pkt->p_nonce, m) != 0) goto out; /* Put header into packet */ M_PREPEND(m, sizeof(struct wg_pkt_data), M_NOWAIT); if (m == NULL) goto out; data = mtod(m, struct wg_pkt_data *); data->t = WG_PKT_DATA; data->r_idx = idx; data->nonce = htole64(pkt->p_nonce); wg_mbuf_reset(m); state = WG_PACKET_CRYPTED; out: pkt->p_mbuf = m; atomic_store_rel_int(&pkt->p_state, state); GROUPTASK_ENQUEUE(&peer->p_send); noise_remote_put(remote); } static void wg_decrypt(struct wg_softc *sc, struct wg_packet *pkt) { struct wg_peer *peer, *allowed_peer; struct noise_remote *remote; struct mbuf *m; int len; enum wg_ring_state state = WG_PACKET_DEAD; remote = noise_keypair_remote(pkt->p_keypair); peer = noise_remote_arg(remote); m = pkt->p_mbuf; /* Read nonce and then adjust to remove the header. */ pkt->p_nonce = le64toh(mtod(m, struct wg_pkt_data *)->nonce); m_adj(m, sizeof(struct wg_pkt_data)); if (noise_keypair_decrypt(pkt->p_keypair, pkt->p_nonce, m) != 0) goto out; /* A packet with length 0 is a keepalive packet */ if (__predict_false(m->m_pkthdr.len == 0)) { DPRINTF(sc, "Receiving keepalive packet from peer " "%" PRIu64 "\n", peer->p_id); state = WG_PACKET_CRYPTED; goto out; } /* * We can let the network stack handle the intricate validation of the * IP header, we just worry about the sizeof and the version, so we can * read the source address in wg_aip_lookup. 
*/ if (determine_af_and_pullup(&m, &pkt->p_af) == 0) { if (pkt->p_af == AF_INET) { struct ip *ip = mtod(m, struct ip *); allowed_peer = wg_aip_lookup(sc, AF_INET, &ip->ip_src); len = ntohs(ip->ip_len); if (len >= sizeof(struct ip) && len < m->m_pkthdr.len) m_adj(m, len - m->m_pkthdr.len); } else if (pkt->p_af == AF_INET6) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); allowed_peer = wg_aip_lookup(sc, AF_INET6, &ip6->ip6_src); len = ntohs(ip6->ip6_plen) + sizeof(struct ip6_hdr); if (len < m->m_pkthdr.len) m_adj(m, len - m->m_pkthdr.len); } else panic("determine_af_and_pullup returned unexpected value"); } else { DPRINTF(sc, "Packet is neither ipv4 nor ipv6 from peer %" PRIu64 "\n", peer->p_id); goto out; } /* We only want to compare the address, not dereference, so drop the ref. */ if (allowed_peer != NULL) noise_remote_put(allowed_peer->p_remote); if (__predict_false(peer != allowed_peer)) { DPRINTF(sc, "Packet has unallowed src IP from peer %" PRIu64 "\n", peer->p_id); goto out; } wg_mbuf_reset(m); state = WG_PACKET_CRYPTED; out: pkt->p_mbuf = m; atomic_store_rel_int(&pkt->p_state, state); GROUPTASK_ENQUEUE(&peer->p_recv); noise_remote_put(remote); } static void wg_softc_decrypt(struct wg_softc *sc) { struct wg_packet *pkt; while ((pkt = wg_queue_dequeue_parallel(&sc->sc_decrypt_parallel)) != NULL) wg_decrypt(sc, pkt); } static void wg_softc_encrypt(struct wg_softc *sc) { struct wg_packet *pkt; while ((pkt = wg_queue_dequeue_parallel(&sc->sc_encrypt_parallel)) != NULL) wg_encrypt(sc, pkt); } static void wg_encrypt_dispatch(struct wg_softc *sc) { /* * The update to encrypt_last_cpu is racey such that we may * reschedule the task for the same CPU multiple times, but * the race doesn't really matter. */ u_int cpu = (sc->sc_encrypt_last_cpu + 1) % mp_ncpus; sc->sc_encrypt_last_cpu = cpu; GROUPTASK_ENQUEUE(&sc->sc_encrypt[cpu]); } static void wg_decrypt_dispatch(struct wg_softc *sc) { u_int cpu = (sc->sc_decrypt_last_cpu + 1) % mp_ncpus; sc->sc_decrypt_last_cpu = cpu; GROUPTASK_ENQUEUE(&sc->sc_decrypt[cpu]); } static void wg_deliver_out(struct wg_peer *peer) { struct wg_endpoint endpoint; struct wg_softc *sc = peer->p_sc; struct wg_packet *pkt; struct mbuf *m; int rc, len; wg_peer_get_endpoint(peer, &endpoint); while ((pkt = wg_queue_dequeue_serial(&peer->p_encrypt_serial)) != NULL) { if (atomic_load_acq_int(&pkt->p_state) != WG_PACKET_CRYPTED) goto error; m = pkt->p_mbuf; pkt->p_mbuf = NULL; len = m->m_pkthdr.len; wg_timers_event_any_authenticated_packet_traversal(peer); wg_timers_event_any_authenticated_packet_sent(peer); rc = wg_send(sc, &endpoint, m); if (rc == 0) { if (len > (sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN)) wg_timers_event_data_sent(peer); counter_u64_add(peer->p_tx_bytes, len); } else if (rc == EADDRNOTAVAIL) { wg_peer_clear_src(peer); wg_peer_get_endpoint(peer, &endpoint); goto error; } else { goto error; } wg_packet_free(pkt); if (noise_keep_key_fresh_send(peer->p_remote)) wg_timers_event_want_initiation(peer); continue; error: if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); wg_packet_free(pkt); } } static void wg_deliver_in(struct wg_peer *peer) { struct wg_softc *sc = peer->p_sc; if_t ifp = sc->sc_ifp; struct wg_packet *pkt; struct mbuf *m; struct epoch_tracker et; while ((pkt = wg_queue_dequeue_serial(&peer->p_decrypt_serial)) != NULL) { if (atomic_load_acq_int(&pkt->p_state) != WG_PACKET_CRYPTED) goto error; m = pkt->p_mbuf; if (noise_keypair_nonce_check(pkt->p_keypair, pkt->p_nonce) != 0) goto error; if (noise_keypair_received_with(pkt->p_keypair) == 
ECONNRESET) wg_timers_event_handshake_complete(peer); wg_timers_event_any_authenticated_packet_received(peer); wg_timers_event_any_authenticated_packet_traversal(peer); wg_peer_set_endpoint(peer, &pkt->p_endpoint); counter_u64_add(peer->p_rx_bytes, m->m_pkthdr.len + sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN); if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len + sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN); if (m->m_pkthdr.len == 0) goto done; MPASS(pkt->p_af == AF_INET || pkt->p_af == AF_INET6); pkt->p_mbuf = NULL; m->m_pkthdr.rcvif = ifp; NET_EPOCH_ENTER(et); BPF_MTAP2_AF(ifp, m, pkt->p_af); CURVNET_SET(if_getvnet(ifp)); M_SETFIB(m, if_getfib(ifp)); if (pkt->p_af == AF_INET) netisr_dispatch(NETISR_IP, m); if (pkt->p_af == AF_INET6) netisr_dispatch(NETISR_IPV6, m); CURVNET_RESTORE(); NET_EPOCH_EXIT(et); wg_timers_event_data_received(peer); done: if (noise_keep_key_fresh_recv(peer->p_remote)) wg_timers_event_want_initiation(peer); wg_packet_free(pkt); continue; error: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); wg_packet_free(pkt); } } static struct wg_packet * wg_packet_alloc(struct mbuf *m) { struct wg_packet *pkt; if ((pkt = uma_zalloc(wg_packet_zone, M_NOWAIT | M_ZERO)) == NULL) return (NULL); pkt->p_mbuf = m; return (pkt); } static void wg_packet_free(struct wg_packet *pkt) { if (pkt->p_keypair != NULL) noise_keypair_put(pkt->p_keypair); if (pkt->p_mbuf != NULL) m_freem(pkt->p_mbuf); uma_zfree(wg_packet_zone, pkt); } static void wg_queue_init(struct wg_queue *queue, const char *name) { mtx_init(&queue->q_mtx, name, NULL, MTX_DEF); STAILQ_INIT(&queue->q_queue); queue->q_len = 0; } static void wg_queue_deinit(struct wg_queue *queue) { wg_queue_purge(queue); mtx_destroy(&queue->q_mtx); } static size_t wg_queue_len(struct wg_queue *queue) { return (queue->q_len); } static int wg_queue_enqueue_handshake(struct wg_queue *hs, struct wg_packet *pkt) { int ret = 0; mtx_lock(&hs->q_mtx); if (hs->q_len < MAX_QUEUED_HANDSHAKES) { STAILQ_INSERT_TAIL(&hs->q_queue, pkt, p_parallel); hs->q_len++; } else { ret = ENOBUFS; } mtx_unlock(&hs->q_mtx); if (ret != 0) wg_packet_free(pkt); return (ret); } static struct wg_packet * wg_queue_dequeue_handshake(struct wg_queue *hs) { struct wg_packet *pkt; mtx_lock(&hs->q_mtx); if ((pkt = STAILQ_FIRST(&hs->q_queue)) != NULL) { STAILQ_REMOVE_HEAD(&hs->q_queue, p_parallel); hs->q_len--; } mtx_unlock(&hs->q_mtx); return (pkt); } static void wg_queue_push_staged(struct wg_queue *staged, struct wg_packet *pkt) { struct wg_packet *old = NULL; mtx_lock(&staged->q_mtx); if (staged->q_len >= MAX_STAGED_PKT) { old = STAILQ_FIRST(&staged->q_queue); STAILQ_REMOVE_HEAD(&staged->q_queue, p_parallel); staged->q_len--; } STAILQ_INSERT_TAIL(&staged->q_queue, pkt, p_parallel); staged->q_len++; mtx_unlock(&staged->q_mtx); if (old != NULL) wg_packet_free(old); } static void wg_queue_enlist_staged(struct wg_queue *staged, struct wg_packet_list *list) { struct wg_packet *pkt, *tpkt; STAILQ_FOREACH_SAFE(pkt, list, p_parallel, tpkt) wg_queue_push_staged(staged, pkt); } static void wg_queue_delist_staged(struct wg_queue *staged, struct wg_packet_list *list) { STAILQ_INIT(list); mtx_lock(&staged->q_mtx); STAILQ_CONCAT(list, &staged->q_queue); staged->q_len = 0; mtx_unlock(&staged->q_mtx); } static void wg_queue_purge(struct wg_queue *staged) { struct wg_packet_list list; struct wg_packet *pkt, *tpkt; wg_queue_delist_staged(staged, &list); STAILQ_FOREACH_SAFE(pkt, &list, p_parallel, tpkt) wg_packet_free(pkt); } static int 
wg_queue_both(struct wg_queue *parallel, struct wg_queue *serial, struct wg_packet *pkt) { pkt->p_state = WG_PACKET_UNCRYPTED; mtx_lock(&serial->q_mtx); if (serial->q_len < MAX_QUEUED_PKT) { serial->q_len++; STAILQ_INSERT_TAIL(&serial->q_queue, pkt, p_serial); } else { mtx_unlock(&serial->q_mtx); wg_packet_free(pkt); return (ENOBUFS); } mtx_unlock(&serial->q_mtx); mtx_lock(&parallel->q_mtx); if (parallel->q_len < MAX_QUEUED_PKT) { parallel->q_len++; STAILQ_INSERT_TAIL(&parallel->q_queue, pkt, p_parallel); } else { mtx_unlock(&parallel->q_mtx); pkt->p_state = WG_PACKET_DEAD; return (ENOBUFS); } mtx_unlock(&parallel->q_mtx); return (0); } static struct wg_packet * wg_queue_dequeue_serial(struct wg_queue *serial) { struct wg_packet *pkt = NULL; mtx_lock(&serial->q_mtx); if (serial->q_len > 0 && STAILQ_FIRST(&serial->q_queue)->p_state != WG_PACKET_UNCRYPTED) { serial->q_len--; pkt = STAILQ_FIRST(&serial->q_queue); STAILQ_REMOVE_HEAD(&serial->q_queue, p_serial); } mtx_unlock(&serial->q_mtx); return (pkt); } static struct wg_packet * wg_queue_dequeue_parallel(struct wg_queue *parallel) { struct wg_packet *pkt = NULL; mtx_lock(&parallel->q_mtx); if (parallel->q_len > 0) { parallel->q_len--; pkt = STAILQ_FIRST(&parallel->q_queue); STAILQ_REMOVE_HEAD(&parallel->q_queue, p_parallel); } mtx_unlock(&parallel->q_mtx); return (pkt); } static bool wg_input(struct mbuf *m, int offset, struct inpcb *inpcb, const struct sockaddr *sa, void *_sc) { #ifdef INET const struct sockaddr_in *sin; #endif #ifdef INET6 const struct sockaddr_in6 *sin6; #endif struct noise_remote *remote; struct wg_pkt_data *data; struct wg_packet *pkt; struct wg_peer *peer; struct wg_softc *sc = _sc; struct mbuf *defragged; defragged = m_defrag(m, M_NOWAIT); if (defragged) m = defragged; m = m_unshare(m, M_NOWAIT); if (!m) { if_inc_counter(sc->sc_ifp, IFCOUNTER_IQDROPS, 1); return true; } /* Caller provided us with `sa`, no need for this header. */ m_adj(m, offset + sizeof(struct udphdr)); /* Pullup enough to read packet type */ if ((m = m_pullup(m, sizeof(uint32_t))) == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_IQDROPS, 1); return true; } if ((pkt = wg_packet_alloc(m)) == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_IQDROPS, 1); m_freem(m); return true; } /* Save send/recv address and port for later. */ switch (sa->sa_family) { #ifdef INET case AF_INET: sin = (const struct sockaddr_in *)sa; pkt->p_endpoint.e_remote.r_sin = sin[0]; pkt->p_endpoint.e_local.l_in = sin[1].sin_addr; break; #endif #ifdef INET6 case AF_INET6: sin6 = (const struct sockaddr_in6 *)sa; pkt->p_endpoint.e_remote.r_sin6 = sin6[0]; pkt->p_endpoint.e_local.l_in6 = sin6[1].sin6_addr; break; #endif default: goto error; } if ((m->m_pkthdr.len == sizeof(struct wg_pkt_initiation) && *mtod(m, uint32_t *) == WG_PKT_INITIATION) || (m->m_pkthdr.len == sizeof(struct wg_pkt_response) && *mtod(m, uint32_t *) == WG_PKT_RESPONSE) || (m->m_pkthdr.len == sizeof(struct wg_pkt_cookie) && *mtod(m, uint32_t *) == WG_PKT_COOKIE)) { if (wg_queue_enqueue_handshake(&sc->sc_handshake_queue, pkt) != 0) { if_inc_counter(sc->sc_ifp, IFCOUNTER_IQDROPS, 1); DPRINTF(sc, "Dropping handshake packet\n"); } GROUPTASK_ENQUEUE(&sc->sc_handshake); } else if (m->m_pkthdr.len >= sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN && *mtod(m, uint32_t *) == WG_PKT_DATA) { /* Pullup whole header to read r_idx below. 
*/ if ((pkt->p_mbuf = m_pullup(m, sizeof(struct wg_pkt_data))) == NULL) goto error; data = mtod(pkt->p_mbuf, struct wg_pkt_data *); if ((pkt->p_keypair = noise_keypair_lookup(sc->sc_local, data->r_idx)) == NULL) goto error; remote = noise_keypair_remote(pkt->p_keypair); peer = noise_remote_arg(remote); if (wg_queue_both(&sc->sc_decrypt_parallel, &peer->p_decrypt_serial, pkt) != 0) if_inc_counter(sc->sc_ifp, IFCOUNTER_IQDROPS, 1); wg_decrypt_dispatch(sc); noise_remote_put(remote); } else { goto error; } return true; error: if_inc_counter(sc->sc_ifp, IFCOUNTER_IERRORS, 1); wg_packet_free(pkt); return true; } static void wg_peer_send_staged(struct wg_peer *peer) { struct wg_packet_list list; struct noise_keypair *keypair; struct wg_packet *pkt, *tpkt; struct wg_softc *sc = peer->p_sc; wg_queue_delist_staged(&peer->p_stage_queue, &list); if (STAILQ_EMPTY(&list)) return; if ((keypair = noise_keypair_current(peer->p_remote)) == NULL) goto error; STAILQ_FOREACH(pkt, &list, p_parallel) { if (noise_keypair_nonce_next(keypair, &pkt->p_nonce) != 0) goto error_keypair; } STAILQ_FOREACH_SAFE(pkt, &list, p_parallel, tpkt) { pkt->p_keypair = noise_keypair_ref(keypair); if (wg_queue_both(&sc->sc_encrypt_parallel, &peer->p_encrypt_serial, pkt) != 0) if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1); } wg_encrypt_dispatch(sc); noise_keypair_put(keypair); return; error_keypair: noise_keypair_put(keypair); error: wg_queue_enlist_staged(&peer->p_stage_queue, &list); wg_timers_event_want_initiation(peer); } static inline void xmit_err(if_t ifp, struct mbuf *m, struct wg_packet *pkt, sa_family_t af) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); switch (af) { #ifdef INET case AF_INET: icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); if (pkt) pkt->p_mbuf = NULL; m = NULL; break; #endif #ifdef INET6 case AF_INET6: icmp6_error(m, ICMP6_DST_UNREACH, 0, 0); if (pkt) pkt->p_mbuf = NULL; m = NULL; break; #endif } if (pkt) wg_packet_free(pkt); else if (m) m_freem(m); } static int wg_xmit(if_t ifp, struct mbuf *m, sa_family_t af, uint32_t mtu) { struct wg_packet *pkt = NULL; struct wg_softc *sc = if_getsoftc(ifp); struct wg_peer *peer; int rc = 0; sa_family_t peer_af; /* Work around lifetime issue in the ipv6 mld code. 
*/ if (__predict_false((if_getflags(ifp) & IFF_DYING) || !sc)) { rc = ENXIO; goto err_xmit; } if ((pkt = wg_packet_alloc(m)) == NULL) { rc = ENOBUFS; goto err_xmit; } pkt->p_mtu = mtu; pkt->p_af = af; if (af == AF_INET) { peer = wg_aip_lookup(sc, AF_INET, &mtod(m, struct ip *)->ip_dst); } else if (af == AF_INET6) { peer = wg_aip_lookup(sc, AF_INET6, &mtod(m, struct ip6_hdr *)->ip6_dst); } else { rc = EAFNOSUPPORT; goto err_xmit; } BPF_MTAP2_AF(ifp, m, pkt->p_af); if (__predict_false(peer == NULL)) { rc = ENETUNREACH; goto err_xmit; } if (__predict_false(if_tunnel_check_nesting(ifp, m, MTAG_WGLOOP, MAX_LOOPS))) { DPRINTF(sc, "Packet looped"); rc = ELOOP; goto err_peer; } peer_af = peer->p_endpoint.e_remote.r_sa.sa_family; if (__predict_false(peer_af != AF_INET && peer_af != AF_INET6)) { DPRINTF(sc, "No valid endpoint has been configured or " "discovered for peer %" PRIu64 "\n", peer->p_id); rc = EHOSTUNREACH; goto err_peer; } wg_queue_push_staged(&peer->p_stage_queue, pkt); wg_peer_send_staged(peer); noise_remote_put(peer->p_remote); return (0); err_peer: noise_remote_put(peer->p_remote); err_xmit: xmit_err(ifp, m, pkt, af); return (rc); } static inline int determine_af_and_pullup(struct mbuf **m, sa_family_t *af) { u_char ipv; if ((*m)->m_pkthdr.len >= sizeof(struct ip6_hdr)) *m = m_pullup(*m, sizeof(struct ip6_hdr)); else if ((*m)->m_pkthdr.len >= sizeof(struct ip)) *m = m_pullup(*m, sizeof(struct ip)); else return (EAFNOSUPPORT); if (*m == NULL) return (ENOBUFS); ipv = mtod(*m, struct ip *)->ip_v; if (ipv == 4) *af = AF_INET; else if (ipv == 6 && (*m)->m_pkthdr.len >= sizeof(struct ip6_hdr)) *af = AF_INET6; else return (EAFNOSUPPORT); return (0); } static int wg_transmit(if_t ifp, struct mbuf *m) { sa_family_t af; int ret; struct mbuf *defragged; defragged = m_defrag(m, M_NOWAIT); if (defragged) m = defragged; m = m_unshare(m, M_NOWAIT); if (!m) { xmit_err(ifp, m, NULL, AF_UNSPEC); return (ENOBUFS); } ret = determine_af_and_pullup(&m, &af); if (ret) { xmit_err(ifp, m, NULL, AF_UNSPEC); return (ret); } return (wg_xmit(ifp, m, af, if_getmtu(ifp))); } static int wg_output(if_t ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { sa_family_t parsed_af; uint32_t af, mtu; int ret; struct mbuf *defragged; - if (dst->sa_family == AF_UNSPEC) + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) memcpy(&af, dst->sa_data, sizeof(af)); else af = dst->sa_family; if (af == AF_UNSPEC) { xmit_err(ifp, m, NULL, af); return (EAFNOSUPPORT); } defragged = m_defrag(m, M_NOWAIT); if (defragged) m = defragged; m = m_unshare(m, M_NOWAIT); if (!m) { xmit_err(ifp, m, NULL, AF_UNSPEC); return (ENOBUFS); } ret = determine_af_and_pullup(&m, &parsed_af); if (ret) { xmit_err(ifp, m, NULL, AF_UNSPEC); return (ret); } if (parsed_af != af) { xmit_err(ifp, m, NULL, AF_UNSPEC); return (EAFNOSUPPORT); } mtu = (ro != NULL && ro->ro_mtu > 0) ? 
ro->ro_mtu : if_getmtu(ifp); return (wg_xmit(ifp, m, parsed_af, mtu)); } static int wg_peer_add(struct wg_softc *sc, const nvlist_t *nvl) { uint8_t public[WG_KEY_SIZE]; const void *pub_key, *preshared_key = NULL; const struct sockaddr *endpoint; int err; size_t size; struct noise_remote *remote; struct wg_peer *peer = NULL; bool need_insert = false; sx_assert(&sc->sc_lock, SX_XLOCKED); if (!nvlist_exists_binary(nvl, "public-key")) { return (EINVAL); } pub_key = nvlist_get_binary(nvl, "public-key", &size); if (size != WG_KEY_SIZE) { return (EINVAL); } if (noise_local_keys(sc->sc_local, public, NULL) == 0 && bcmp(public, pub_key, WG_KEY_SIZE) == 0) { return (0); // Silently ignored; not actually a failure. } if ((remote = noise_remote_lookup(sc->sc_local, pub_key)) != NULL) peer = noise_remote_arg(remote); if (nvlist_exists_bool(nvl, "remove") && nvlist_get_bool(nvl, "remove")) { if (remote != NULL) { wg_peer_destroy(peer); noise_remote_put(remote); } return (0); } if (nvlist_exists_bool(nvl, "replace-allowedips") && nvlist_get_bool(nvl, "replace-allowedips") && peer != NULL) { wg_aip_remove_all(sc, peer); } if (peer == NULL) { peer = wg_peer_alloc(sc, pub_key); need_insert = true; } if (nvlist_exists_binary(nvl, "endpoint")) { endpoint = nvlist_get_binary(nvl, "endpoint", &size); if (size > sizeof(peer->p_endpoint.e_remote)) { err = EINVAL; goto out; } memcpy(&peer->p_endpoint.e_remote, endpoint, size); } if (nvlist_exists_binary(nvl, "preshared-key")) { preshared_key = nvlist_get_binary(nvl, "preshared-key", &size); if (size != WG_KEY_SIZE) { err = EINVAL; goto out; } noise_remote_set_psk(peer->p_remote, preshared_key); } if (nvlist_exists_number(nvl, "persistent-keepalive-interval")) { uint64_t pki = nvlist_get_number(nvl, "persistent-keepalive-interval"); if (pki > UINT16_MAX) { err = EINVAL; goto out; } wg_timers_set_persistent_keepalive(peer, pki); } if (nvlist_exists_nvlist_array(nvl, "allowed-ips")) { const void *addr; uint64_t cidr; const nvlist_t * const * aipl; size_t allowedip_count; aipl = nvlist_get_nvlist_array(nvl, "allowed-ips", &allowedip_count); for (size_t idx = 0; idx < allowedip_count; idx++) { if (!nvlist_exists_number(aipl[idx], "cidr")) continue; cidr = nvlist_get_number(aipl[idx], "cidr"); if (nvlist_exists_binary(aipl[idx], "ipv4")) { addr = nvlist_get_binary(aipl[idx], "ipv4", &size); if (addr == NULL || cidr > 32 || size != sizeof(struct in_addr)) { err = EINVAL; goto out; } if ((err = wg_aip_add(sc, peer, AF_INET, addr, cidr)) != 0) goto out; } else if (nvlist_exists_binary(aipl[idx], "ipv6")) { addr = nvlist_get_binary(aipl[idx], "ipv6", &size); if (addr == NULL || cidr > 128 || size != sizeof(struct in6_addr)) { err = EINVAL; goto out; } if ((err = wg_aip_add(sc, peer, AF_INET6, addr, cidr)) != 0) goto out; } else { continue; } } } if (need_insert) { if ((err = noise_remote_enable(peer->p_remote)) != 0) goto out; TAILQ_INSERT_TAIL(&sc->sc_peers, peer, p_entry); sc->sc_peers_num++; if (if_getlinkstate(sc->sc_ifp) == LINK_STATE_UP) wg_timers_enable(peer); } if (remote != NULL) noise_remote_put(remote); return (0); out: if (need_insert) /* If we fail, only destroy if it was new. */ wg_peer_destroy(peer); if (remote != NULL) noise_remote_put(remote); return (err); } static int wgc_set(struct wg_softc *sc, struct wg_data_io *wgd) { uint8_t public[WG_KEY_SIZE], private[WG_KEY_SIZE]; if_t ifp; void *nvlpacked; nvlist_t *nvl; ssize_t size; int err; ifp = sc->sc_ifp; if (wgd->wgd_size == 0 || wgd->wgd_data == NULL) return (EFAULT); /* Can nvlists be streamed in? 
It's not nice to impose arbitrary limits like that but * there needs to be _some_ limitation. */ if (wgd->wgd_size >= UINT32_MAX / 2) return (E2BIG); nvlpacked = malloc(wgd->wgd_size, M_TEMP, M_WAITOK | M_ZERO); err = copyin(wgd->wgd_data, nvlpacked, wgd->wgd_size); if (err) goto out; nvl = nvlist_unpack(nvlpacked, wgd->wgd_size, 0); if (nvl == NULL) { err = EBADMSG; goto out; } sx_xlock(&sc->sc_lock); if (nvlist_exists_bool(nvl, "replace-peers") && nvlist_get_bool(nvl, "replace-peers")) wg_peer_destroy_all(sc); if (nvlist_exists_number(nvl, "listen-port")) { uint64_t new_port = nvlist_get_number(nvl, "listen-port"); if (new_port > UINT16_MAX) { err = EINVAL; goto out_locked; } if (new_port != sc->sc_socket.so_port) { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { if ((err = wg_socket_init(sc, new_port)) != 0) goto out_locked; } else sc->sc_socket.so_port = new_port; } } if (nvlist_exists_binary(nvl, "private-key")) { const void *key = nvlist_get_binary(nvl, "private-key", &size); if (size != WG_KEY_SIZE) { err = EINVAL; goto out_locked; } if (noise_local_keys(sc->sc_local, NULL, private) != 0 || timingsafe_bcmp(private, key, WG_KEY_SIZE) != 0) { struct wg_peer *peer; if (curve25519_generate_public(public, key)) { /* Peer conflict: remove conflicting peer. */ struct noise_remote *remote; if ((remote = noise_remote_lookup(sc->sc_local, public)) != NULL) { peer = noise_remote_arg(remote); wg_peer_destroy(peer); noise_remote_put(remote); } } /* * Set the private key and invalidate all existing * handshakes. */ /* Note: we might be removing the private key. */ noise_local_private(sc->sc_local, key); if (noise_local_keys(sc->sc_local, NULL, NULL) == 0) cookie_checker_update(&sc->sc_cookie, public); else cookie_checker_update(&sc->sc_cookie, NULL); } } if (nvlist_exists_number(nvl, "user-cookie")) { uint64_t user_cookie = nvlist_get_number(nvl, "user-cookie"); if (user_cookie > UINT32_MAX) { err = EINVAL; goto out_locked; } err = wg_socket_set_cookie(sc, user_cookie); if (err) goto out_locked; } if (nvlist_exists_nvlist_array(nvl, "peers")) { size_t peercount; const nvlist_t * const*nvl_peers; nvl_peers = nvlist_get_nvlist_array(nvl, "peers", &peercount); for (int i = 0; i < peercount; i++) { err = wg_peer_add(sc, nvl_peers[i]); if (err != 0) goto out_locked; } } out_locked: sx_xunlock(&sc->sc_lock); nvlist_destroy(nvl); out: zfree(nvlpacked, M_TEMP); return (err); } static int wgc_get(struct wg_softc *sc, struct wg_data_io *wgd) { uint8_t public_key[WG_KEY_SIZE] = { 0 }; uint8_t private_key[WG_KEY_SIZE] = { 0 }; uint8_t preshared_key[NOISE_SYMMETRIC_KEY_LEN] = { 0 }; nvlist_t *nvl, *nvl_peer, *nvl_aip, **nvl_peers, **nvl_aips; size_t size, peer_count, aip_count, i, j; struct wg_timespec64 ts64; struct wg_peer *peer; struct wg_aip *aip; void *packed; int err = 0; nvl = nvlist_create(0); if (!nvl) return (ENOMEM); sx_slock(&sc->sc_lock); if (sc->sc_socket.so_port != 0) nvlist_add_number(nvl, "listen-port", sc->sc_socket.so_port); if (sc->sc_socket.so_user_cookie != 0) nvlist_add_number(nvl, "user-cookie", sc->sc_socket.so_user_cookie); if (noise_local_keys(sc->sc_local, public_key, private_key) == 0) { nvlist_add_binary(nvl, "public-key", public_key, WG_KEY_SIZE); if (wgc_privileged(sc)) nvlist_add_binary(nvl, "private-key", private_key, WG_KEY_SIZE); explicit_bzero(private_key, sizeof(private_key)); } peer_count = sc->sc_peers_num; if (peer_count) { nvl_peers = mallocarray(peer_count, sizeof(void *), M_NVLIST, M_WAITOK | M_ZERO); i = 0; TAILQ_FOREACH(peer, &sc->sc_peers, p_entry) { if (i >= 
peer_count) panic("peers changed from under us"); nvl_peers[i++] = nvl_peer = nvlist_create(0); if (!nvl_peer) { err = ENOMEM; goto err_peer; } (void)noise_remote_keys(peer->p_remote, public_key, preshared_key); nvlist_add_binary(nvl_peer, "public-key", public_key, sizeof(public_key)); if (wgc_privileged(sc)) nvlist_add_binary(nvl_peer, "preshared-key", preshared_key, sizeof(preshared_key)); explicit_bzero(preshared_key, sizeof(preshared_key)); if (peer->p_endpoint.e_remote.r_sa.sa_family == AF_INET) nvlist_add_binary(nvl_peer, "endpoint", &peer->p_endpoint.e_remote, sizeof(struct sockaddr_in)); else if (peer->p_endpoint.e_remote.r_sa.sa_family == AF_INET6) nvlist_add_binary(nvl_peer, "endpoint", &peer->p_endpoint.e_remote, sizeof(struct sockaddr_in6)); wg_timers_get_last_handshake(peer, &ts64); nvlist_add_binary(nvl_peer, "last-handshake-time", &ts64, sizeof(ts64)); nvlist_add_number(nvl_peer, "persistent-keepalive-interval", peer->p_persistent_keepalive_interval); nvlist_add_number(nvl_peer, "rx-bytes", counter_u64_fetch(peer->p_rx_bytes)); nvlist_add_number(nvl_peer, "tx-bytes", counter_u64_fetch(peer->p_tx_bytes)); aip_count = peer->p_aips_num; if (aip_count) { nvl_aips = mallocarray(aip_count, sizeof(void *), M_NVLIST, M_WAITOK | M_ZERO); j = 0; LIST_FOREACH(aip, &peer->p_aips, a_entry) { if (j >= aip_count) panic("aips changed from under us"); nvl_aips[j++] = nvl_aip = nvlist_create(0); if (!nvl_aip) { err = ENOMEM; goto err_aip; } if (aip->a_af == AF_INET) { nvlist_add_binary(nvl_aip, "ipv4", &aip->a_addr.in, sizeof(aip->a_addr.in)); nvlist_add_number(nvl_aip, "cidr", bitcount32(aip->a_mask.ip)); } #ifdef INET6 else if (aip->a_af == AF_INET6) { nvlist_add_binary(nvl_aip, "ipv6", &aip->a_addr.in6, sizeof(aip->a_addr.in6)); nvlist_add_number(nvl_aip, "cidr", in6_mask2len(&aip->a_mask.in6, NULL)); } #endif } nvlist_add_nvlist_array(nvl_peer, "allowed-ips", (const nvlist_t *const *)nvl_aips, aip_count); err_aip: for (j = 0; j < aip_count; ++j) nvlist_destroy(nvl_aips[j]); free(nvl_aips, M_NVLIST); if (err) goto err_peer; } } nvlist_add_nvlist_array(nvl, "peers", (const nvlist_t * const *)nvl_peers, peer_count); err_peer: for (i = 0; i < peer_count; ++i) nvlist_destroy(nvl_peers[i]); free(nvl_peers, M_NVLIST); if (err) { sx_sunlock(&sc->sc_lock); goto err; } } sx_sunlock(&sc->sc_lock); packed = nvlist_pack(nvl, &size); if (!packed) { err = ENOMEM; goto err; } if (!wgd->wgd_size) { wgd->wgd_size = size; goto out; } if (wgd->wgd_size < size) { err = ENOSPC; goto out; } err = copyout(packed, wgd->wgd_data, size); wgd->wgd_size = size; out: zfree(packed, M_NVLIST); err: nvlist_destroy(nvl); return (err); } static int wg_ioctl(if_t ifp, u_long cmd, caddr_t data) { struct wg_data_io *wgd = (struct wg_data_io *)data; struct ifreq *ifr = (struct ifreq *)data; struct wg_softc *sc; int ret = 0; sx_slock(&wg_sx); sc = if_getsoftc(ifp); if (!sc) { ret = ENXIO; goto out; } switch (cmd) { case SIOCSWG: ret = priv_check(curthread, PRIV_NET_WG); if (ret == 0) ret = wgc_set(sc, wgd); break; case SIOCGWG: ret = wgc_get(sc, wgd); break; /* Interface IOCTLs */ case SIOCSIFADDR: /* * This differs from *BSD norms, but is more uniform with how * WireGuard behaves elsewhere. 
*/ break; case SIOCSIFFLAGS: if (if_getflags(ifp) & IFF_UP) ret = wg_up(sc); else wg_down(sc); break; case SIOCSIFMTU: if (ifr->ifr_mtu <= 0 || ifr->ifr_mtu > MAX_MTU) ret = EINVAL; else if_setmtu(ifp, ifr->ifr_mtu); break; case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGTUNFIB: ifr->ifr_fib = sc->sc_socket.so_fibnum; break; case SIOCSTUNFIB: ret = priv_check(curthread, PRIV_NET_WG); if (ret) break; ret = priv_check(curthread, PRIV_NET_SETIFFIB); if (ret) break; sx_xlock(&sc->sc_lock); ret = wg_socket_set_fibnum(sc, ifr->ifr_fib); sx_xunlock(&sc->sc_lock); break; default: ret = ENOTTY; } out: sx_sunlock(&wg_sx); return (ret); } static int wg_up(struct wg_softc *sc) { if_t ifp = sc->sc_ifp; struct wg_peer *peer; int rc = EBUSY; sx_xlock(&sc->sc_lock); /* Jail's being removed, no more wg_up(). */ if ((sc->sc_flags & WGF_DYING) != 0) goto out; /* Silent success if we're already running. */ rc = 0; if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) goto out; if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); rc = wg_socket_init(sc, sc->sc_socket.so_port); if (rc == 0) { TAILQ_FOREACH(peer, &sc->sc_peers, p_entry) wg_timers_enable(peer); if_link_state_change(sc->sc_ifp, LINK_STATE_UP); } else { if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); DPRINTF(sc, "Unable to initialize sockets: %d\n", rc); } out: sx_xunlock(&sc->sc_lock); return (rc); } static void wg_down(struct wg_softc *sc) { if_t ifp = sc->sc_ifp; struct wg_peer *peer; sx_xlock(&sc->sc_lock); if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) { sx_xunlock(&sc->sc_lock); return; } if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); TAILQ_FOREACH(peer, &sc->sc_peers, p_entry) { wg_queue_purge(&peer->p_stage_queue); wg_timers_disable(peer); } wg_queue_purge(&sc->sc_handshake_queue); TAILQ_FOREACH(peer, &sc->sc_peers, p_entry) { noise_remote_handshake_clear(peer->p_remote); noise_remote_keypairs_clear(peer->p_remote); } if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); wg_socket_uninit(sc); sx_xunlock(&sc->sc_lock); } static int wg_clone_create(struct if_clone *ifc, char *name, size_t len, struct ifc_data *ifd, struct ifnet **ifpp) { struct wg_softc *sc; if_t ifp; sc = malloc(sizeof(*sc), M_WG, M_WAITOK | M_ZERO); sc->sc_local = noise_local_alloc(sc); sc->sc_encrypt = mallocarray(sizeof(struct grouptask), mp_ncpus, M_WG, M_WAITOK | M_ZERO); sc->sc_decrypt = mallocarray(sizeof(struct grouptask), mp_ncpus, M_WG, M_WAITOK | M_ZERO); if (!rn_inithead((void **)&sc->sc_aip4, offsetof(struct aip_addr, in) * NBBY)) goto free_decrypt; if (!rn_inithead((void **)&sc->sc_aip6, offsetof(struct aip_addr, in6) * NBBY)) goto free_aip4; atomic_add_int(&clone_count, 1); ifp = sc->sc_ifp = if_alloc(IFT_WIREGUARD); sc->sc_ucred = crhold(curthread->td_ucred); sc->sc_socket.so_fibnum = curthread->td_proc->p_fibnum; sc->sc_socket.so_port = 0; TAILQ_INIT(&sc->sc_peers); sc->sc_peers_num = 0; cookie_checker_init(&sc->sc_cookie); RADIX_NODE_HEAD_LOCK_INIT(sc->sc_aip4); RADIX_NODE_HEAD_LOCK_INIT(sc->sc_aip6); GROUPTASK_INIT(&sc->sc_handshake, 0, (gtask_fn_t *)wg_softc_handshake_receive, sc); taskqgroup_attach(qgroup_wg_tqg, &sc->sc_handshake, sc, NULL, NULL, "wg tx initiation"); wg_queue_init(&sc->sc_handshake_queue, "hsq"); for (int i = 0; i < mp_ncpus; i++) { GROUPTASK_INIT(&sc->sc_encrypt[i], 0, (gtask_fn_t *)wg_softc_encrypt, sc); taskqgroup_attach_cpu(qgroup_wg_tqg, &sc->sc_encrypt[i], sc, i, NULL, NULL, "wg encrypt"); GROUPTASK_INIT(&sc->sc_decrypt[i], 0, (gtask_fn_t *)wg_softc_decrypt, sc); taskqgroup_attach_cpu(qgroup_wg_tqg, &sc->sc_decrypt[i], sc, i, NULL, NULL, "wg decrypt"); } 
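/* Each data packet is queued twice by wg_queue_both(): on one of the per-interface parallel queues initialized below, which the per-CPU encrypt/decrypt tasks created above drain in parallel, and on a per-peer serial queue from which wg_deliver_out()/wg_deliver_in() dequeue only packets whose crypto has completed, preserving per-peer ordering. */ 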
wg_queue_init(&sc->sc_encrypt_parallel, "encp"); wg_queue_init(&sc->sc_decrypt_parallel, "decp"); sx_init(&sc->sc_lock, "wg softc lock"); if_setsoftc(ifp, sc); if_setcapabilities(ifp, WG_CAPS); if_setcapenable(ifp, WG_CAPS); if_initname(ifp, wgname, ifd->unit); if_setmtu(ifp, DEFAULT_MTU); if_setflags(ifp, IFF_NOARP | IFF_MULTICAST); if_setinitfn(ifp, wg_init); if_setreassignfn(ifp, wg_reassign); if_setqflushfn(ifp, wg_qflush); if_settransmitfn(ifp, wg_transmit); if_setoutputfn(ifp, wg_output); if_setioctlfn(ifp, wg_ioctl); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(uint32_t)); #ifdef INET6 ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; ND_IFINFO(ifp)->flags |= ND6_IFF_NO_DAD; #endif sx_xlock(&wg_sx); LIST_INSERT_HEAD(&wg_list, sc, sc_entry); sx_xunlock(&wg_sx); *ifpp = ifp; return (0); free_aip4: RADIX_NODE_HEAD_DESTROY(sc->sc_aip4); free(sc->sc_aip4, M_RTABLE); free_decrypt: free(sc->sc_decrypt, M_WG); free(sc->sc_encrypt, M_WG); noise_local_free(sc->sc_local, NULL); free(sc, M_WG); return (ENOMEM); } static void wg_clone_deferred_free(struct noise_local *l) { struct wg_softc *sc = noise_local_arg(l); free(sc, M_WG); atomic_add_int(&clone_count, -1); } static int wg_clone_destroy(struct if_clone *ifc, if_t ifp, uint32_t flags) { struct wg_softc *sc = if_getsoftc(ifp); struct ucred *cred; sx_xlock(&wg_sx); if_setsoftc(ifp, NULL); sx_xlock(&sc->sc_lock); sc->sc_flags |= WGF_DYING; cred = sc->sc_ucred; sc->sc_ucred = NULL; sx_xunlock(&sc->sc_lock); LIST_REMOVE(sc, sc_entry); sx_xunlock(&wg_sx); if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); CURVNET_SET(if_getvnet(sc->sc_ifp)); if_purgeaddrs(sc->sc_ifp); CURVNET_RESTORE(); sx_xlock(&sc->sc_lock); wg_socket_uninit(sc); sx_xunlock(&sc->sc_lock); /* * No guarantees that all traffic have passed until the epoch has * elapsed with the socket closed. */ NET_EPOCH_WAIT(); taskqgroup_drain_all(qgroup_wg_tqg); sx_xlock(&sc->sc_lock); wg_peer_destroy_all(sc); NET_EPOCH_DRAIN_CALLBACKS(); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); taskqgroup_detach(qgroup_wg_tqg, &sc->sc_handshake); for (int i = 0; i < mp_ncpus; i++) { taskqgroup_detach(qgroup_wg_tqg, &sc->sc_encrypt[i]); taskqgroup_detach(qgroup_wg_tqg, &sc->sc_decrypt[i]); } free(sc->sc_encrypt, M_WG); free(sc->sc_decrypt, M_WG); wg_queue_deinit(&sc->sc_handshake_queue); wg_queue_deinit(&sc->sc_encrypt_parallel); wg_queue_deinit(&sc->sc_decrypt_parallel); RADIX_NODE_HEAD_DESTROY(sc->sc_aip4); RADIX_NODE_HEAD_DESTROY(sc->sc_aip6); rn_detachhead((void **)&sc->sc_aip4); rn_detachhead((void **)&sc->sc_aip6); cookie_checker_free(&sc->sc_cookie); if (cred != NULL) crfree(cred); bpfdetach(sc->sc_ifp); if_detach(sc->sc_ifp); if_free(sc->sc_ifp); noise_local_free(sc->sc_local, wg_clone_deferred_free); return (0); } static void wg_qflush(if_t ifp __unused) { } /* * Privileged information (private-key, preshared-key) are only exported for * root and jailed root by default. 
*/ static bool wgc_privileged(struct wg_softc *sc) { struct thread *td; td = curthread; return (priv_check(td, PRIV_NET_WG) == 0); } static void wg_reassign(if_t ifp, struct vnet *new_vnet __unused, char *unused __unused) { struct wg_softc *sc; sc = if_getsoftc(ifp); wg_down(sc); } static void wg_init(void *xsc) { struct wg_softc *sc; sc = xsc; wg_up(sc); } static void vnet_wg_init(const void *unused __unused) { struct if_clone_addreq req = { .create_f = wg_clone_create, .destroy_f = wg_clone_destroy, .flags = IFC_F_AUTOUNIT, }; V_wg_cloner = ifc_attach_cloner(wgname, &req); } VNET_SYSINIT(vnet_wg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_wg_init, NULL); static void vnet_wg_uninit(const void *unused __unused) { if (V_wg_cloner) ifc_detach_cloner(V_wg_cloner); } VNET_SYSUNINIT(vnet_wg_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_wg_uninit, NULL); static int wg_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; struct wg_softc *sc; /* * Do a pass through all if_wg interfaces and release creds on any from * the jail that are supposed to be going away. This will, in turn, let * the jail die so that we don't end up with Schrödinger's jail. */ sx_slock(&wg_sx); LIST_FOREACH(sc, &wg_list, sc_entry) { sx_xlock(&sc->sc_lock); if (!(sc->sc_flags & WGF_DYING) && sc->sc_ucred && sc->sc_ucred->cr_prison == pr) { struct ucred *cred = sc->sc_ucred; DPRINTF(sc, "Creating jail exiting\n"); if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); wg_socket_uninit(sc); sc->sc_ucred = NULL; crfree(cred); sc->sc_flags |= WGF_DYING; } sx_xunlock(&sc->sc_lock); } sx_sunlock(&wg_sx); return (0); } #ifdef SELFTESTS #include "selftest/allowedips.c" static bool wg_run_selftests(void) { bool ret = true; ret &= wg_allowedips_selftest(); ret &= noise_counter_selftest(); ret &= cookie_selftest(); return ret; } #else static inline bool wg_run_selftests(void) { return true; } #endif static int wg_module_init(void) { int ret; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = wg_prison_remove, }; wg_packet_zone = uma_zcreate("wg packet", sizeof(struct wg_packet), NULL, NULL, NULL, NULL, 0, 0); ret = crypto_init(); if (ret != 0) return (ret); ret = cookie_init(); if (ret != 0) return (ret); wg_osd_jail_slot = osd_jail_register(NULL, methods); if (!wg_run_selftests()) return (ENOTRECOVERABLE); return (0); } static void wg_module_deinit(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { struct if_clone *clone = VNET_VNET(vnet_iter, wg_cloner); if (clone) { ifc_detach_cloner(clone); VNET_VNET(vnet_iter, wg_cloner) = NULL; } } VNET_LIST_RUNLOCK(); NET_EPOCH_WAIT(); MPASS(LIST_EMPTY(&wg_list)); if (wg_osd_jail_slot != 0) osd_jail_deregister(wg_osd_jail_slot); cookie_deinit(); crypto_deinit(); if (wg_packet_zone != NULL) uma_zdestroy(wg_packet_zone); } static int wg_module_event_handler(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return wg_module_init(); case MOD_UNLOAD: wg_module_deinit(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t wg_moduledata = { "if_wg", wg_module_event_handler, NULL }; DECLARE_MODULE(if_wg, wg_moduledata, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_wg, WIREGUARD_VERSION); MODULE_DEPEND(if_wg, crypto, 1, 1, 1); diff --git a/sys/net/if_disc.c b/sys/net/if_disc.c index 02f3bbbfdaf1..193bb31d138f 100644 --- a/sys/net/if_disc.c +++ b/sys/net/if_disc.c @@ -1,246 +1,246 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the 
University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if_loop.c 8.1 (Berkeley) 6/10/93 */ /* * Discard interface driver for protocol testing and timing. * (Based on the loopback.) */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_inet.h" #include "opt_inet6.h" #ifdef TINY_DSMTU #define DSMTU (1024+512) #else #define DSMTU 65532 #endif struct disc_softc { struct ifnet *sc_ifp; }; static int discoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int discioctl(struct ifnet *, u_long, caddr_t); static int disc_clone_create(struct if_clone *, int, caddr_t); static void disc_clone_destroy(struct ifnet *); static const char discname[] = "disc"; static MALLOC_DEFINE(M_DISC, discname, "Discard interface"); VNET_DEFINE_STATIC(struct if_clone *, disc_cloner); #define V_disc_cloner VNET(disc_cloner) static int disc_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct ifnet *ifp; struct disc_softc *sc; sc = malloc(sizeof(struct disc_softc), M_DISC, M_WAITOK | M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_LOOP); if (ifp == NULL) { free(sc, M_DISC); return (ENOSPC); } ifp->if_softc = sc; if_initname(ifp, discname, unit); ifp->if_mtu = DSMTU; /* * IFF_LOOPBACK should not be removed from disc's flags because * it controls what PF-specific routes are magically added when * a network address is assigned to the interface. Things just * won't work as intended w/o such routes because the output * interface selection for a packet is totally route-driven. * A valid alternative to IFF_LOOPBACK can be IFF_BROADCAST or * IFF_POINTOPOINT, but it would result in different properties * of the interface. 
*/ ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_drv_flags = IFF_DRV_RUNNING; ifp->if_ioctl = discioctl; ifp->if_output = discoutput; ifp->if_hdrlen = 0; ifp->if_addrlen = 0; ifp->if_snd.ifq_maxlen = 20; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); return (0); } static void disc_clone_destroy(struct ifnet *ifp) { struct disc_softc *sc; sc = ifp->if_softc; bpfdetach(ifp); if_detach(ifp); if_free(ifp); free(sc, M_DISC); } static void vnet_disc_init(const void *unused __unused) { V_disc_cloner = if_clone_simple(discname, disc_clone_create, disc_clone_destroy, 0); } VNET_SYSINIT(vnet_disc_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_disc_init, NULL); static void vnet_disc_uninit(const void *unused __unused) { if_clone_detach(V_disc_cloner); } VNET_SYSUNINIT(vnet_disc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_disc_uninit, NULL); static int disc_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t disc_mod = { "if_disc", disc_modevent, NULL }; DECLARE_MODULE(if_disc, disc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int discoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int32_t af; M_ASSERTPKTHDR(m); /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); m->m_pkthdr.rcvif = ifp; if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); m_freem(m); return (0); } /* * Process an ioctl request. */ static int discioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; /* * Everything else is done at a higher level. */ break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: error = EAFNOSUPPORT; break; } break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; default: error = EINVAL; } return (error); } diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c index e5065889d732..ef64c15074ed 100644 --- a/sys/net/if_gif.c +++ b/sys/net/if_gif.c @@ -1,725 +1,726 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #ifndef INET #include #endif #include #include #include #include #endif /* INET6 */ #include #include #include #include #include static const char gifname[] = "gif"; MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static struct sx gif_ioctl_sx; SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl"); void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); void (*ng_gif_attach_p)(struct ifnet *ifp); void (*ng_gif_detach_p)(struct ifnet *ifp); #ifdef VIMAGE static void gif_reassign(struct ifnet *, struct vnet *, char *); #endif static void gif_delete_tunnel(struct gif_softc *); static int gif_ioctl(struct ifnet *, u_long, caddr_t); static int gif_transmit(struct ifnet *, struct mbuf *); static void gif_qflush(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, gif_cloner); #define V_gif_cloner VNET(gif_cloner) SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Generic Tunnel Interface"); #ifndef MAX_GIF_NEST /* * This macro controls the default upper limitation on nesting of gif tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gif tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. 
*/ #define MAX_GIF_NEST 1 #endif VNET_DEFINE_STATIC(int, max_gif_nesting) = MAX_GIF_NEST; #define V_max_gif_nesting VNET(max_gif_nesting) SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels"); static int gif_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gif_softc *sc; sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO); sc->gif_fibnum = curthread->td_proc->p_fibnum; GIF2IFP(sc) = if_alloc(IFT_GIF); GIF2IFP(sc)->if_softc = sc; if_initname(GIF2IFP(sc), gifname, unit); GIF2IFP(sc)->if_addrlen = 0; GIF2IFP(sc)->if_mtu = GIF_MTU; GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; GIF2IFP(sc)->if_ioctl = gif_ioctl; GIF2IFP(sc)->if_transmit = gif_transmit; GIF2IFP(sc)->if_qflush = gif_qflush; GIF2IFP(sc)->if_output = gif_output; #ifdef VIMAGE GIF2IFP(sc)->if_reassign = gif_reassign; #endif GIF2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; GIF2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(GIF2IFP(sc)); bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t)); if (ng_gif_attach_p != NULL) (*ng_gif_attach_p)(GIF2IFP(sc)); return (0); } #ifdef VIMAGE static void gif_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, char *unused __unused) { struct gif_softc *sc; sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; if (sc != NULL) gif_delete_tunnel(sc); sx_xunlock(&gif_ioctl_sx); } #endif /* VIMAGE */ static void gif_clone_destroy(struct ifnet *ifp) { struct gif_softc *sc; sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; gif_delete_tunnel(sc); if (ng_gif_detach_p != NULL) (*ng_gif_detach_p)(ifp); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&gif_ioctl_sx); GIF_WAIT(); if_free(ifp); free(sc, M_GIF); } static void vnet_gif_init(const void *unused __unused) { V_gif_cloner = if_clone_simple(gifname, gif_clone_create, gif_clone_destroy, 0); #ifdef INET in_gif_init(); #endif #ifdef INET6 in6_gif_init(); #endif } VNET_SYSINIT(vnet_gif_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_gif_init, NULL); static void vnet_gif_uninit(const void *unused __unused) { if_clone_detach(V_gif_cloner); #ifdef INET in_gif_uninit(); #endif #ifdef INET6 in6_gif_uninit(); #endif } VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_gif_uninit, NULL); static int gifmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gif_mod = { "if_gif", gifmodevent, 0 }; DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gif, 1); struct gif_list * gif_hashinit(void) { struct gif_list *hash; int i; hash = malloc(sizeof(struct gif_list) * GIF_HASH_SIZE, M_GIF, M_WAITOK); for (i = 0; i < GIF_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } void gif_hashdestroy(struct gif_list *hash) { free(hash, M_GIF); } #define MTAG_GIF 1080679712 static int gif_transmit(struct ifnet *ifp, struct mbuf *m) { struct gif_softc *sc; struct etherip_header *eth; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif uint32_t af; uint8_t proto, ecn; int error; NET_EPOCH_ASSERT(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); goto err; } #endif error = ENETDOWN; sc = ifp->if_softc; if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->gif_family == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_GIF, V_max_gif_nesting)) != 0) { m_freem(m); goto 
err; } /* Now pull back the af that we stashed in the csum_data. */ if (ifp->if_bridge) af = AF_LINK; else af = m->m_pkthdr.csum_data; m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->gif_fibnum); BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); /* inner AF-specific encapsulation */ ecn = 0; switch (af) { #ifdef INET case AF_INET: proto = IPPROTO_IPV4; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto err; } ip = mtod(m, struct ip *); ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos); break; #endif #ifdef INET6 case AF_INET6: proto = IPPROTO_IPV6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { error = ENOBUFS; goto err; } t = 0; ip6 = mtod(m, struct ip6_hdr *); ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow); ecn = (ntohl(t) >> 20) & 0xff; break; #endif case AF_LINK: proto = IPPROTO_ETHERIP; M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto err; } eth = mtod(m, struct etherip_header *); eth->eip_resvh = 0; eth->eip_ver = ETHERIP_VERSION; eth->eip_resvl = 0; break; default: error = EAFNOSUPPORT; m_freem(m); goto err; } /* XXX should we check if our outer source is legal? */ /* dispatch to output logic based on outer AF */ switch (sc->gif_family) { #ifdef INET case AF_INET: error = in_gif_output(ifp, m, proto, ecn); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_output(ifp, m, proto, ecn); break; #endif default: m_freem(m); } err: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static void gif_qflush(struct ifnet *ifp __unused) { } int gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; KASSERT(ifp->if_bridge == NULL, ("%s: unexpectedly called with bridge attached", __func__)); - if (dst->sa_family == AF_UNSPEC) + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) memcpy(&af, dst->sa_data, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); /* * Now save the af in the inbound pkt csum data, this is a cheat since * we are using the inbound csum_data field to carry the af over to * the gif_transmit() routine, avoiding using yet another mtag. */ m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } void gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn) { struct etherip_header *eip; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif struct ether_header *eh; struct ifnet *oldifp; int isr, n, af; NET_EPOCH_ASSERT(); if (ifp == NULL) { /* just in case */ m_freem(m); return; } m->m_pkthdr.rcvif = ifp; m_clrprotoflags(m); switch (proto) { #ifdef INET case IPPROTO_IPV4: af = AF_INET; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) goto drop; ip = mtod(m, struct ip *); if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos) == 0) { m_freem(m); goto drop; } break; #endif #ifdef INET6 case IPPROTO_IPV6: af = AF_INET6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) goto drop; t = htonl((uint32_t)ecn << 20); ip6 = mtod(m, struct ip6_hdr *); if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? 
ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow) == 0) { m_freem(m); goto drop; } break; #endif case IPPROTO_ETHERIP: af = AF_LINK; break; default: m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (bpf_peers_present(ifp->if_bpf)) { uint32_t af1 = af; bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m); } if ((ifp->if_flags & IFF_MONITOR) != 0) { if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return; } if (ng_gif_input_p != NULL) { (*ng_gif_input_p)(ifp, &m, af); if (m == NULL) goto drop; } /* * Put the packet to the network layer input queue according to the * specified address family. * Note: older versions of gif_input directly called network layer * input functions, e.g. ip6_input, here. We changed the policy to * prevent too many recursive calls of such input functions, which * might cause kernel panic. But the change may introduce another * problem; if the input queue is full, packets are discarded. * The kernel stack overflow really happened, and we believed * queue-full rarely occurs, so we changed the policy. */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif case AF_LINK: n = sizeof(struct etherip_header) + sizeof(struct ether_header); if (n > m->m_len) m = m_pullup(m, n); if (m == NULL) goto drop; eip = mtod(m, struct etherip_header *); if (eip->eip_ver != ETHERIP_VERSION) { /* discard unknown versions */ m_freem(m); goto drop; } m_adj_decap(m, sizeof(struct etherip_header)); m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = ifp; if (ifp->if_bridge) { oldifp = ifp; eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } BRIDGE_INPUT(ifp, m); if (m != NULL && ifp != oldifp) { /* * The bridge gave us back itself or one of the * members for which the frame is addressed. 
*/ ether_demux(ifp, m); return; } } if (m != NULL) m_freem(m); return; default: if (ng_gif_input_orphan_p != NULL) (*ng_gif_input_orphan_p)(ifp, m, af); else m_freem(m); return; } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } static int gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq*)data; struct gif_softc *sc; u_int options; int error; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFMTU: case SIOCSIFFLAGS: return (0); case SIOCSIFMTU: if (ifr->ifr_mtu < GIF_MTU_MIN || ifr->ifr_mtu > GIF_MTU_MAX) return (EINVAL); else ifp->if_mtu = ifr->ifr_mtu; return (0); } sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto bad; } error = 0; switch (cmd) { case SIOCDIFPHYADDR: if (sc->gif_family == 0) break; gif_delete_tunnel(sc); break; #ifdef INET case SIOCSIFPHYADDR: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: error = in_gif_ioctl(sc, cmd, data); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: error = in6_gif_ioctl(sc, cmd, data); break; #endif case SIOCGTUNFIB: ifr->ifr_fib = sc->gif_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->gif_fibnum = ifr->ifr_fib; break; case GIFGOPTS: options = sc->gif_options; error = copyout(&options, ifr_data_get_ptr(ifr), sizeof(options)); break; case GIFSOPTS: if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) break; error = copyin(ifr_data_get_ptr(ifr), &options, sizeof(options)); if (error) break; if (options & ~GIF_OPTMASK) { error = EINVAL; break; } if (sc->gif_options != options) { switch (sc->gif_family) { #ifdef INET case AF_INET: error = in_gif_setopts(sc, options); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_setopts(sc, options); break; #endif default: /* No need to invoke AF-handler */ sc->gif_options = options; } } break; default: error = EINVAL; break; } if (error == 0 && sc->gif_family != 0) { if ( #ifdef INET cmd == SIOCSIFPHYADDR || #endif #ifdef INET6 cmd == SIOCSIFPHYADDR_IN6 || #endif 0) { if_link_state_change(ifp, LINK_STATE_UP); } } bad: sx_xunlock(&gif_ioctl_sx); return (error); } static void gif_delete_tunnel(struct gif_softc *sc) { sx_assert(&gif_ioctl_sx, SA_XLOCKED); if (sc->gif_family != 0) { CK_LIST_REMOVE(sc, srchash); CK_LIST_REMOVE(sc, chain); /* Wait until it become safe to free gif_hdr */ GIF_WAIT(); free(sc->gif_hdr, M_GIF); } sc->gif_family = 0; GIF2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GIF2IFP(sc), LINK_STATE_DOWN); } diff --git a/sys/net/if_gre.c b/sys/net/if_gre.c index 55163416f807..ca9c4835daf6 100644 --- a/sys/net/if_gre.c +++ b/sys/net/if_gre.c @@ -1,831 +1,832 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1998 The NetBSD Foundation, Inc. * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * IPv6-over-GRE contributed by Gert Doering * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $ */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #include #ifdef RSS #include #endif #endif #ifdef INET6 #include #include #include #ifdef RSS #include #endif #endif #include #include #include #include #include #include #define GREMTU 1476 static const char grename[] = "gre"; MALLOC_DEFINE(M_GRE, grename, "Generic Routing Encapsulation"); static struct sx gre_ioctl_sx; SX_SYSINIT(gre_ioctl_sx, &gre_ioctl_sx, "gre_ioctl"); static int gre_clone_create(struct if_clone *, int, caddr_t); static void gre_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, gre_cloner); #define V_gre_cloner VNET(gre_cloner) #ifdef VIMAGE static void gre_reassign(struct ifnet *, struct vnet *, char *); #endif static void gre_qflush(struct ifnet *); static int gre_transmit(struct ifnet *, struct mbuf *); static int gre_ioctl(struct ifnet *, u_long, caddr_t); static int gre_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void gre_delete_tunnel(struct gre_softc *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Generic Routing Encapsulation"); #ifndef MAX_GRE_NEST /* * This macro controls the default upper limitation on nesting of gre tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gre tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. 
*/ #define MAX_GRE_NEST 1 #endif VNET_DEFINE_STATIC(int, max_gre_nesting) = MAX_GRE_NEST; #define V_max_gre_nesting VNET(max_gre_nesting) SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_gre_nesting), 0, "Max nested tunnels"); static void vnet_gre_init(const void *unused __unused) { V_gre_cloner = if_clone_simple(grename, gre_clone_create, gre_clone_destroy, 0); #ifdef INET in_gre_init(); #endif #ifdef INET6 in6_gre_init(); #endif } VNET_SYSINIT(vnet_gre_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_init, NULL); static void vnet_gre_uninit(const void *unused __unused) { if_clone_detach(V_gre_cloner); #ifdef INET in_gre_uninit(); #endif #ifdef INET6 in6_gre_uninit(); #endif /* XXX: epoch_call drain */ } VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_uninit, NULL); static int gre_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gre_softc *sc; sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK | M_ZERO); sc->gre_fibnum = curthread->td_proc->p_fibnum; GRE2IFP(sc) = if_alloc(IFT_TUNNEL); GRE2IFP(sc)->if_softc = sc; if_initname(GRE2IFP(sc), grename, unit); GRE2IFP(sc)->if_mtu = GREMTU; GRE2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; GRE2IFP(sc)->if_output = gre_output; GRE2IFP(sc)->if_ioctl = gre_ioctl; GRE2IFP(sc)->if_transmit = gre_transmit; GRE2IFP(sc)->if_qflush = gre_qflush; #ifdef VIMAGE GRE2IFP(sc)->if_reassign = gre_reassign; #endif GRE2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; GRE2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(GRE2IFP(sc)); bpfattach(GRE2IFP(sc), DLT_NULL, sizeof(u_int32_t)); return (0); } #ifdef VIMAGE static void gre_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, char *unused __unused) { struct gre_softc *sc; sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; if (sc != NULL) gre_delete_tunnel(sc); sx_xunlock(&gre_ioctl_sx); } #endif /* VIMAGE */ static void gre_clone_destroy(struct ifnet *ifp) { struct gre_softc *sc; sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; gre_delete_tunnel(sc); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&gre_ioctl_sx); GRE_WAIT(); if_free(ifp); free(sc, M_GRE); } static int gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct gre_softc *sc; uint32_t opt; int error; switch (cmd) { case SIOCSIFMTU: /* XXX: */ if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); case GRESADDRS: case GRESADDRD: case GREGADDRS: case GREGADDRD: case GRESPROTO: case GREGPROTO: return (EOPNOTSUPP); } sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCDIFPHYADDR: if (sc->gre_family == 0) break; gre_delete_tunnel(sc); break; #ifdef INET case SIOCSIFPHYADDR: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: error = in_gre_ioctl(sc, cmd, data); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: error = in6_gre_ioctl(sc, cmd, data); break; #endif case SIOCGTUNFIB: ifr->ifr_fib = sc->gre_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->gre_fibnum = ifr->ifr_fib; break; case GRESKEY: case GRESOPTS: case GRESPORT: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if ((error = 
copyin(ifr_data_get_ptr(ifr), &opt, sizeof(opt))) != 0) break; if (cmd == GRESKEY) { if (sc->gre_key == opt) break; } else if (cmd == GRESOPTS) { if (opt & ~GRE_OPTMASK) { error = EINVAL; break; } if (sc->gre_options == opt) break; } else if (cmd == GRESPORT) { if (opt != 0 && (opt < V_ipport_hifirstauto || opt > V_ipport_hilastauto)) { error = EINVAL; break; } if (sc->gre_port == opt) break; if ((sc->gre_options & GRE_UDPENCAP) == 0) { /* * UDP encapsulation is not enabled, thus * there is no need to reattach softc. */ sc->gre_port = opt; break; } } switch (sc->gre_family) { #ifdef INET case AF_INET: error = in_gre_setopts(sc, cmd, opt); break; #endif #ifdef INET6 case AF_INET6: error = in6_gre_setopts(sc, cmd, opt); break; #endif default: /* * Tunnel is not yet configured. * We can just change any parameters. */ if (cmd == GRESKEY) sc->gre_key = opt; if (cmd == GRESOPTS) sc->gre_options = opt; if (cmd == GRESPORT) sc->gre_port = opt; break; } /* * XXX: Do we need to initiate change of interface * state here? */ break; case GREGKEY: error = copyout(&sc->gre_key, ifr_data_get_ptr(ifr), sizeof(sc->gre_key)); break; case GREGOPTS: error = copyout(&sc->gre_options, ifr_data_get_ptr(ifr), sizeof(sc->gre_options)); break; case GREGPORT: error = copyout(&sc->gre_port, ifr_data_get_ptr(ifr), sizeof(sc->gre_port)); break; default: error = EINVAL; break; } if (error == 0 && sc->gre_family != 0) { if ( #ifdef INET cmd == SIOCSIFPHYADDR || #endif #ifdef INET6 cmd == SIOCSIFPHYADDR_IN6 || #endif 0) { if_link_state_change(ifp, LINK_STATE_UP); } } end: sx_xunlock(&gre_ioctl_sx); return (error); } static void gre_delete_tunnel(struct gre_softc *sc) { struct gre_socket *gs; sx_assert(&gre_ioctl_sx, SA_XLOCKED); if (sc->gre_family != 0) { CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); free(sc->gre_hdr, M_GRE); sc->gre_family = 0; } /* * If this Tunnel was the last one that could use UDP socket, * we should unlink socket from hash table and close it. 
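 * The gre_socket structure itself is reclaimed via NET_EPOCH_CALL(gre_sofree, ...)
 * below, so readers still running inside the network epoch never see it freed
 * underneath them.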
*/ if ((gs = sc->gre_so) != NULL && CK_LIST_EMPTY(&gs->list)) { CK_LIST_REMOVE(gs, chain); soclose(gs->so); NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx); sc->gre_so = NULL; } GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GRE2IFP(sc), LINK_STATE_DOWN); } struct gre_list * gre_hashinit(void) { struct gre_list *hash; int i; hash = malloc(sizeof(struct gre_list) * GRE_HASH_SIZE, M_GRE, M_WAITOK); for (i = 0; i < GRE_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } void gre_hashdestroy(struct gre_list *hash) { free(hash, M_GRE); } void gre_sofree(epoch_context_t ctx) { struct gre_socket *gs; gs = __containerof(ctx, struct gre_socket, epoch_ctx); free(gs, M_GRE); } static __inline uint16_t gre_cksum_add(uint16_t sum, uint16_t a) { uint16_t res; res = sum + a; return (res + (res < a)); } void gre_update_udphdr(struct gre_softc *sc, struct udphdr *udp, uint16_t csum) { sx_assert(&gre_ioctl_sx, SA_XLOCKED); MPASS(sc->gre_options & GRE_UDPENCAP); udp->uh_dport = htons(GRE_UDPPORT); udp->uh_sport = htons(sc->gre_port); udp->uh_sum = csum; udp->uh_ulen = 0; } void gre_update_hdr(struct gre_softc *sc, struct grehdr *gh) { uint32_t *opts; uint16_t flags; sx_assert(&gre_ioctl_sx, SA_XLOCKED); flags = 0; opts = gh->gre_opts; if (sc->gre_options & GRE_ENABLE_CSUM) { flags |= GRE_FLAGS_CP; sc->gre_hlen += 2 * sizeof(uint16_t); *opts++ = 0; } if (sc->gre_key != 0) { flags |= GRE_FLAGS_KP; sc->gre_hlen += sizeof(uint32_t); *opts++ = htonl(sc->gre_key); } if (sc->gre_options & GRE_ENABLE_SEQ) { flags |= GRE_FLAGS_SP; sc->gre_hlen += sizeof(uint32_t); *opts++ = 0; } else sc->gre_oseq = 0; gh->gre_flags = htons(flags); } int gre_input(struct mbuf *m, int off, int proto, void *arg) { struct gre_softc *sc = arg; struct grehdr *gh; struct ifnet *ifp; uint32_t *opts; #ifdef notyet uint32_t key; #endif uint16_t flags; int hlen, isr, af; ifp = GRE2IFP(sc); hlen = off + sizeof(struct grehdr) + 4 * sizeof(uint32_t); if (m->m_pkthdr.len < hlen) goto drop; if (m->m_len < hlen) { m = m_pullup(m, hlen); if (m == NULL) goto drop; } gh = (struct grehdr *)mtodo(m, off); flags = ntohs(gh->gre_flags); if (flags & ~GRE_FLAGS_MASK) goto drop; opts = gh->gre_opts; hlen = 2 * sizeof(uint16_t); if (flags & GRE_FLAGS_CP) { /* reserved1 field must be zero */ if (((uint16_t *)opts)[1] != 0) goto drop; if (in_cksum_skip(m, m->m_pkthdr.len, off) != 0) goto drop; hlen += 2 * sizeof(uint16_t); opts++; } if (flags & GRE_FLAGS_KP) { #ifdef notyet /* * XXX: The current implementation uses the key only for outgoing * packets. But we can check the key value here, or even in the * encapcheck function. */ key = ntohl(*opts); #endif hlen += sizeof(uint32_t); opts++; } #ifdef notyet } else key = 0; if (sc->gre_key != 0 && (key != sc->gre_key || key != 0)) goto drop; #endif if (flags & GRE_FLAGS_SP) { #ifdef notyet seq = ntohl(*opts); #endif hlen += sizeof(uint32_t); } switch (ntohs(gh->gre_proto)) { case ETHERTYPE_WCCP: /* * For WCCP skip an additional 4 bytes if after GRE header * doesn't follow an IP header. 
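 * (WCCP redirection may place an extra 4-byte header between the GRE header and
 * the encapsulated IP packet; the test below peeks at the first nibble following
 * the GRE header and skips those 4 bytes whenever it is not the IPv4 version
 * nibble, 0x4.)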
*/ if (flags == 0 && (*(uint8_t *)gh->gre_opts & 0xF0) != 0x40) hlen += sizeof(uint32_t); /* FALLTHROUGH */ case ETHERTYPE_IP: isr = NETISR_IP; af = AF_INET; break; case ETHERTYPE_IPV6: isr = NETISR_IPV6; af = AF_INET6; break; default: goto drop; } m_adj(m, off + hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; M_SETFIB(m, ifp->if_fib); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(isr, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return (IPPROTO_DONE); } static int gre_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; - if (dst->sa_family == AF_UNSPEC) + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); /* * Now save the af in the inbound pkt csum data, this is a cheat since * we are using the inbound csum_data field to carry the af over to * the gre_transmit() routine, avoiding using yet another mtag. */ m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } static void gre_setseqn(struct grehdr *gh, uint32_t seq) { uint32_t *opts; uint16_t flags; opts = gh->gre_opts; flags = ntohs(gh->gre_flags); KASSERT((flags & GRE_FLAGS_SP) != 0, ("gre_setseqn called, but GRE_FLAGS_SP isn't set ")); if (flags & GRE_FLAGS_CP) opts++; if (flags & GRE_FLAGS_KP) opts++; *opts = htonl(seq); } static uint32_t gre_flowid(struct gre_softc *sc, struct mbuf *m, uint32_t af) { uint32_t flowid = 0; if ((sc->gre_options & GRE_UDPENCAP) == 0 || sc->gre_port != 0) return (flowid); switch (af) { #ifdef INET case AF_INET: #ifdef RSS flowid = rss_hash_ip4_2tuple(mtod(m, struct ip *)->ip_src, mtod(m, struct ip *)->ip_dst); break; #endif flowid = mtod(m, struct ip *)->ip_src.s_addr ^ mtod(m, struct ip *)->ip_dst.s_addr; break; #endif #ifdef INET6 case AF_INET6: #ifdef RSS flowid = rss_hash_ip6_2tuple( &mtod(m, struct ip6_hdr *)->ip6_src, &mtod(m, struct ip6_hdr *)->ip6_dst); break; #endif flowid = mtod(m, struct ip6_hdr *)->ip6_src.s6_addr32[3] ^ mtod(m, struct ip6_hdr *)->ip6_dst.s6_addr32[3]; break; #endif default: break; } return (flowid); } #define MTAG_GRE 1307983903 static int gre_transmit(struct ifnet *ifp, struct mbuf *m) { GRE_RLOCK_TRACKER; struct gre_softc *sc; struct grehdr *gh; struct udphdr *uh; uint32_t af, flowid; int error, len; uint16_t proto; len = 0; GRE_RLOCK(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); goto drop; } #endif error = ENETDOWN; sc = ifp->if_softc; if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->gre_family == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_GRE, V_max_gre_nesting)) != 0) { m_freem(m); goto drop; } af = m->m_pkthdr.csum_data; BPF_MTAP2(ifp, &af, sizeof(af), m); m->m_flags &= ~(M_BCAST|M_MCAST); flowid = gre_flowid(sc, m, af); M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, sc->gre_hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto drop; } bcopy(sc->gre_hdr, mtod(m, void *), sc->gre_hlen); /* Determine GRE proto */ switch (af) { #ifdef INET case AF_INET: proto = htons(ETHERTYPE_IP); break; #endif #ifdef INET6 case AF_INET6: proto = htons(ETHERTYPE_IPV6); break; #endif default: m_freem(m); error = 
ENETDOWN; goto drop; } /* Determine offset of GRE header */ switch (sc->gre_family) { #ifdef INET case AF_INET: len = sizeof(struct ip); break; #endif #ifdef INET6 case AF_INET6: len = sizeof(struct ip6_hdr); break; #endif default: m_freem(m); error = ENETDOWN; goto drop; } if (sc->gre_options & GRE_UDPENCAP) { uh = (struct udphdr *)mtodo(m, len); uh->uh_sport |= htons(V_ipport_hifirstauto) | (flowid >> 16) | (flowid & 0xFFFF); uh->uh_sport = htons(ntohs(uh->uh_sport) % V_ipport_hilastauto); uh->uh_ulen = htons(m->m_pkthdr.len - len); uh->uh_sum = gre_cksum_add(uh->uh_sum, htons(m->m_pkthdr.len - len + IPPROTO_UDP)); m->m_pkthdr.csum_flags = sc->gre_csumflags; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); len += sizeof(struct udphdr); } gh = (struct grehdr *)mtodo(m, len); gh->gre_proto = proto; if (sc->gre_options & GRE_ENABLE_SEQ) gre_setseqn(gh, sc->gre_oseq++); if (sc->gre_options & GRE_ENABLE_CSUM) { *(uint16_t *)gh->gre_opts = in_cksum_skip(m, m->m_pkthdr.len, len); } len = m->m_pkthdr.len - len; switch (sc->gre_family) { #ifdef INET case AF_INET: error = in_gre_output(m, af, sc->gre_hlen); break; #endif #ifdef INET6 case AF_INET6: error = in6_gre_output(m, af, sc->gre_hlen, flowid); break; #endif default: m_freem(m); error = ENETDOWN; } drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); } GRE_RUNLOCK(); return (error); } static void gre_qflush(struct ifnet *ifp __unused) { } static int gremodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gre_mod = { "if_gre", gremodevent, 0 }; DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gre, 1); diff --git a/sys/net/if_me.c b/sys/net/if_me.c index e9bcd345b5c5..80c2816b808a 100644 --- a/sys/net/if_me.c +++ b/sys/net/if_me.c @@ -1,688 +1,689 @@ /*- * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MEMTU (1500 - sizeof(struct mobhdr)) static const char mename[] = "me"; static MALLOC_DEFINE(M_IFME, mename, "Minimal Encapsulation for IP"); /* Minimal forwarding header RFC 2004 */ struct mobhdr { uint8_t mob_proto; /* protocol */ uint8_t mob_flags; /* flags */ #define MOB_FLAGS_SP 0x80 /* source present */ uint16_t mob_csum; /* header checksum */ struct in_addr mob_dst; /* original destination address */ struct in_addr mob_src; /* original source addr (optional) */ } __packed; struct me_softc { struct ifnet *me_ifp; u_int me_fibnum; struct in_addr me_src; struct in_addr me_dst; CK_LIST_ENTRY(me_softc) chain; CK_LIST_ENTRY(me_softc) srchash; }; CK_LIST_HEAD(me_list, me_softc); #define ME2IFP(sc) ((sc)->me_ifp) #define ME_READY(sc) ((sc)->me_src.s_addr != 0) #define ME_RLOCK_TRACKER struct epoch_tracker me_et #define ME_RLOCK() epoch_enter_preempt(net_epoch_preempt, &me_et) #define ME_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &me_et) #define ME_WAIT() epoch_wait_preempt(net_epoch_preempt) #ifndef ME_HASH_SIZE #define ME_HASH_SIZE (1 << 4) #endif VNET_DEFINE_STATIC(struct me_list *, me_hashtbl) = NULL; VNET_DEFINE_STATIC(struct me_list *, me_srchashtbl) = NULL; #define V_me_hashtbl VNET(me_hashtbl) #define V_me_srchashtbl VNET(me_srchashtbl) #define ME_HASH(src, dst) (V_me_hashtbl[\ me_hashval((src), (dst)) & (ME_HASH_SIZE - 1)]) #define ME_SRCHASH(src) (V_me_srchashtbl[\ fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (ME_HASH_SIZE - 1)]) static struct sx me_ioctl_sx; SX_SYSINIT(me_ioctl_sx, &me_ioctl_sx, "me_ioctl"); static int me_clone_create(struct if_clone *, int, caddr_t); static void me_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, me_cloner); #define V_me_cloner VNET(me_cloner) #ifdef VIMAGE static void me_reassign(struct ifnet *, struct vnet *, char *); #endif static void me_qflush(struct ifnet *); static int me_transmit(struct ifnet *, struct mbuf *); static int me_ioctl(struct ifnet *, u_long, caddr_t); static int me_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int me_input(struct mbuf *, int, int, void *); static int me_set_tunnel(struct me_softc *, in_addr_t, in_addr_t); static void me_delete_tunnel(struct me_softc *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, me, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Minimal Encapsulation for IP (RFC 2004)"); #ifndef MAX_ME_NEST #define MAX_ME_NEST 1 #endif VNET_DEFINE_STATIC(int, max_me_nesting) = MAX_ME_NEST; #define V_max_me_nesting VNET(max_me_nesting) SYSCTL_INT(_net_link_me, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_me_nesting), 0, "Max nested tunnels"); static uint32_t me_hashval(in_addr_t src, in_addr_t dst) { uint32_t ret; ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); return (fnv_32_buf(&dst, sizeof(dst), ret)); } static struct me_list * me_hashinit(void) { struct me_list *hash; int i; hash = malloc(sizeof(struct me_list) * ME_HASH_SIZE, M_IFME, M_WAITOK); for (i = 0; i < ME_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } static void vnet_me_init(const void *unused __unused) { V_me_cloner = if_clone_simple(mename, me_clone_create, me_clone_destroy, 0); } VNET_SYSINIT(vnet_me_init, 
SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_init, NULL); static void vnet_me_uninit(const void *unused __unused) { if (V_me_hashtbl != NULL) { free(V_me_hashtbl, M_IFME); V_me_hashtbl = NULL; ME_WAIT(); free(V_me_srchashtbl, M_IFME); } if_clone_detach(V_me_cloner); } VNET_SYSUNINIT(vnet_me_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_me_uninit, NULL); static int me_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct me_softc *sc; sc = malloc(sizeof(struct me_softc), M_IFME, M_WAITOK | M_ZERO); sc->me_fibnum = curthread->td_proc->p_fibnum; ME2IFP(sc) = if_alloc(IFT_TUNNEL); ME2IFP(sc)->if_softc = sc; if_initname(ME2IFP(sc), mename, unit); ME2IFP(sc)->if_mtu = MEMTU; ME2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; ME2IFP(sc)->if_output = me_output; ME2IFP(sc)->if_ioctl = me_ioctl; ME2IFP(sc)->if_transmit = me_transmit; ME2IFP(sc)->if_qflush = me_qflush; #ifdef VIMAGE ME2IFP(sc)->if_reassign = me_reassign; #endif ME2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; ME2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(ME2IFP(sc)); bpfattach(ME2IFP(sc), DLT_NULL, sizeof(u_int32_t)); return (0); } #ifdef VIMAGE static void me_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, char *unused __unused) { struct me_softc *sc; sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; if (sc != NULL) me_delete_tunnel(sc); sx_xunlock(&me_ioctl_sx); } #endif /* VIMAGE */ static void me_clone_destroy(struct ifnet *ifp) { struct me_softc *sc; sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; me_delete_tunnel(sc); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&me_ioctl_sx); ME_WAIT(); if_free(ifp); free(sc, M_IFME); } static int me_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *src, *dst; struct me_softc *sc; int error; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); } sx_xlock(&me_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCSIFPHYADDR: src = &((struct in_aliasreq *)data)->ifra_addr; dst = &((struct in_aliasreq *)data)->ifra_dstaddr; if (src->sin_family != dst->sin_family || src->sin_family != AF_INET || src->sin_len != dst->sin_len || src->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } if (src->sin_addr.s_addr == INADDR_ANY || dst->sin_addr.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } error = me_set_tunnel(sc, src->sin_addr.s_addr, dst->sin_addr.s_addr); break; case SIOCDIFPHYADDR: me_delete_tunnel(sc); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (!ME_READY(sc)) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); switch (cmd) { case SIOCGIFPSRCADDR: src->sin_addr = sc->me_src; break; case SIOCGIFPDSTADDR: src->sin_addr = sc->me_dst; break; } error = prison_if(curthread->td_ucred, sintosa(src)); if (error != 0) memset(src, 0, sizeof(*src)); break; case SIOCGTUNFIB: ifr->ifr_fib = sc->me_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_ME)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->me_fibnum = ifr->ifr_fib; break; default: error = EINVAL; break; } end: sx_xunlock(&me_ioctl_sx); return (error); } static int me_lookup(const struct mbuf *m, int off, int 
proto, void **arg) { const struct ip *ip; struct me_softc *sc; if (V_me_hashtbl == NULL) return (0); NET_EPOCH_ASSERT(); ip = mtod(m, const struct ip *); CK_LIST_FOREACH(sc, &ME_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { if (sc->me_src.s_addr == ip->ip_dst.s_addr && sc->me_dst.s_addr == ip->ip_src.s_addr) { if ((ME2IFP(sc)->if_flags & IFF_UP) == 0) return (0); *arg = sc; return (ENCAP_DRV_LOOKUP); } } return (0); } /* * Check that ingress address belongs to local host. */ static void me_set_running(struct me_softc *sc) { if (in_localip(sc->me_src)) ME2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; else ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * ifaddr_event handler. * Clear IFF_DRV_RUNNING flag when ingress address disappears to prevent * source address spoofing. */ static void me_srcaddr(void *arg __unused, const struct sockaddr *sa, int event __unused) { const struct sockaddr_in *sin; struct me_softc *sc; /* Check that VNET is ready */ if (V_me_hashtbl == NULL) return; NET_EPOCH_ASSERT(); sin = (const struct sockaddr_in *)sa; CK_LIST_FOREACH(sc, &ME_SRCHASH(sin->sin_addr.s_addr), srchash) { if (sc->me_src.s_addr != sin->sin_addr.s_addr) continue; me_set_running(sc); } } static int me_set_tunnel(struct me_softc *sc, in_addr_t src, in_addr_t dst) { struct epoch_tracker et; struct me_softc *tmp; sx_assert(&me_ioctl_sx, SA_XLOCKED); if (V_me_hashtbl == NULL) { V_me_hashtbl = me_hashinit(); V_me_srchashtbl = me_hashinit(); } if (sc->me_src.s_addr == src && sc->me_dst.s_addr == dst) return (0); CK_LIST_FOREACH(tmp, &ME_HASH(src, dst), chain) { if (tmp == sc) continue; if (tmp->me_src.s_addr == src && tmp->me_dst.s_addr == dst) return (EADDRNOTAVAIL); } me_delete_tunnel(sc); sc->me_dst.s_addr = dst; sc->me_src.s_addr = src; CK_LIST_INSERT_HEAD(&ME_HASH(src, dst), sc, chain); CK_LIST_INSERT_HEAD(&ME_SRCHASH(src), sc, srchash); NET_EPOCH_ENTER(et); me_set_running(sc); NET_EPOCH_EXIT(et); if_link_state_change(ME2IFP(sc), LINK_STATE_UP); return (0); } static void me_delete_tunnel(struct me_softc *sc) { sx_assert(&me_ioctl_sx, SA_XLOCKED); if (ME_READY(sc)) { CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); ME_WAIT(); sc->me_src.s_addr = 0; sc->me_dst.s_addr = 0; ME2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(ME2IFP(sc), LINK_STATE_DOWN); } } static uint16_t me_in_cksum(uint16_t *p, int nwords) { uint32_t sum = 0; while (nwords-- > 0) sum += *p++; sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); return (~sum); } static int me_input(struct mbuf *m, int off, int proto, void *arg) { struct me_softc *sc = arg; struct mobhdr *mh; struct ifnet *ifp; struct ip *ip; int hlen; NET_EPOCH_ASSERT(); ifp = ME2IFP(sc); /* checks for short packets */ hlen = sizeof(struct mobhdr); if (m->m_pkthdr.len < sizeof(struct ip) + hlen) hlen -= sizeof(struct in_addr); if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) goto drop; mh = (struct mobhdr *)mtodo(m, sizeof(struct ip)); /* check for wrong flags */ if (mh->mob_flags & (~MOB_FLAGS_SP)) { m_freem(m); goto drop; } if (mh->mob_flags) { if (hlen != sizeof(struct mobhdr)) { m_freem(m); goto drop; } } else hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); /* check mobile header checksum */ if (me_in_cksum((uint16_t *)mh, hlen / sizeof(uint16_t)) != 0) { m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif ip = mtod(m, struct ip *); ip->ip_dst = mh->mob_dst; ip->ip_p = mh->mob_proto; ip->ip_sum = 0; ip->ip_len = htons(m->m_pkthdr.len - hlen); if 
(mh->mob_flags) ip->ip_src = mh->mob_src; memmove(mtodo(m, hlen), ip, sizeof(struct ip)); m_adj(m, hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); M_SETFIB(m, ifp->if_fib); hlen = AF_INET; BPF_MTAP2(ifp, &hlen, sizeof(hlen), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(NETISR_IP, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (IPPROTO_DONE); } static int me_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; - if (dst->sa_family == AF_UNSPEC) + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } #define MTAG_ME 1414491977 static int me_transmit(struct ifnet *ifp, struct mbuf *m) { ME_RLOCK_TRACKER; struct mobhdr mh; struct me_softc *sc; struct ip *ip; uint32_t af; int error, hlen, plen; ME_RLOCK(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error != 0) goto drop; #endif error = ENETDOWN; sc = ifp->if_softc; if (sc == NULL || !ME_READY(sc) || (ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_ME, V_max_me_nesting)) != 0) { m_freem(m); goto drop; } af = m->m_pkthdr.csum_data; if (af != AF_INET) { error = EAFNOSUPPORT; m_freem(m); goto drop; } if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto drop; } ip = mtod(m, struct ip *); /* Fragmented datagramms shouldn't be encapsulated */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { error = EINVAL; m_freem(m); goto drop; } mh.mob_proto = ip->ip_p; mh.mob_src = ip->ip_src; mh.mob_dst = ip->ip_dst; if (in_hosteq(sc->me_src, ip->ip_src)) { hlen = sizeof(struct mobhdr) - sizeof(struct in_addr); mh.mob_flags = 0; } else { hlen = sizeof(struct mobhdr); mh.mob_flags = MOB_FLAGS_SP; } BPF_MTAP2(ifp, &af, sizeof(af), m); plen = m->m_pkthdr.len; ip->ip_src = sc->me_src; ip->ip_dst = sc->me_dst; m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->me_fibnum); M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto drop; } if (m->m_len < sizeof(struct ip) + hlen) m = m_pullup(m, sizeof(struct ip) + hlen); if (m == NULL) { error = ENOBUFS; goto drop; } memmove(mtod(m, void *), mtodo(m, hlen), sizeof(struct ip)); ip = mtod(m, struct ip *); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_p = IPPROTO_MOBILE; ip->ip_sum = 0; mh.mob_csum = 0; mh.mob_csum = me_in_cksum((uint16_t *)&mh, hlen / sizeof(uint16_t)); bcopy(&mh, mtodo(m, sizeof(struct ip)), hlen); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, plen); } ME_RUNLOCK(); return (error); } static void me_qflush(struct ifnet *ifp __unused) { } static const struct srcaddrtab *me_srcaddrtab = NULL; static const struct encaptab *ecookie = NULL; static const struct encap_config me_encap_cfg = { .proto = IPPROTO_MOBILE, .min_length = sizeof(struct ip) + sizeof(struct mobhdr) - sizeof(in_addr_t), .exact_match = ENCAP_DRV_LOOKUP, .lookup = me_lookup, .input = me_input }; static int 
memodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: me_srcaddrtab = ip_encap_register_srcaddr(me_srcaddr, NULL, M_WAITOK); ecookie = ip_encap_attach(&me_encap_cfg, NULL, M_WAITOK); break; case MOD_UNLOAD: ip_encap_detach(ecookie); ip_encap_unregister_srcaddr(me_srcaddrtab); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t me_mod = { "if_me", memodevent, 0 }; DECLARE_MODULE(if_me, me_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_me, 1); diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index 4cb219dc92b6..5d37879e87b9 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -1,2083 +1,2083 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #ifdef INET6 #include #include #endif #include #include #include #include #include #include #include #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. 
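 * The tun_busy()/tun_unbusy() reference count together with the tun_cv condition
 * variable (see below) keeps tun_destroy() from tearing the softc down while a
 * device operation is still in flight.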
*/ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_alias; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_UNUSED1 0x0008 #define TUN_UNUSED2 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ int tun_busy; /* busy count */ int tun_vhdrlen; /* virtio-net header length */ struct lro_ctrl tun_lro; /* for TCP LRO */ bool tun_lro_ready; /* TCP LRO initialized */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED); #define TUN_VMIO_FLAG_MASK 0x0fff /* * Interface capabilities of a tap device that supports the virtio-net * header. */ #define TAP_VNET_HDR_CAPS (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 \ | IFCAP_VLAN_HWCSUM \ | IFCAP_TSO | IFCAP_LRO \ | IFCAP_VLAN_HWTSO) #define TAP_ALL_OFFLOAD (CSUM_TSO | CSUM_TCP | CSUM_UDP |\ CSUM_TCP_IPV6 | CSUM_UDP_IPV6) /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. 
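 * In particular, the global tunhead list and each driver's unit-number allocator
 * (created with new_unrhdr(..., &tunmtx)) are covered by that mutex.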
*/ static struct mtx tunmtx; static eventhandler_tag arrival_tag; static eventhandler_tag clone_tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user devfs cloning */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP tunnel software network interface"); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation"); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Enable legacy devfs interface creation for all users"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name); static int tun_busy_locked(struct tuntap_softc *tp); static void tun_unbusy_locked(struct tuntap_softc *tp); static int tun_busy(struct tuntap_softc *tp); static void tun_unbusy(struct tuntap_softc *tp); static int tuntap_name2info(const char *name, int *unit, int *flags); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev); static void tundtor(void *data); static void tunrename(void *arg, struct ifnet *ifp); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); static int tun_clone_create(struct if_clone *, char *, size_t, struct ifc_data *, struct ifnet **); static int tun_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen); static d_open_t tunopen; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); static struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; static struct filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = 
tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; ifc_match_f *clone_match_fn; ifc_create_f *clone_create_fn; ifc_destroy_f *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Mechanism for marking a tunnel device as busy so that we can safely do some * orthogonal operations (such as operations on devices) without racing against * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or * open, to be woken up when the condition is alleviated. */ static int tun_busy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); if ((tp->tun_flags & TUN_DYING) != 0) { /* * Perhaps unintuitive, but the device is busy going away. * Other interpretations of EBUSY from tun_busy make little * sense, since making a busy device even more busy doesn't * sound like a problem. */ return (EBUSY); } ++tp->tun_busy; return (0); } static void tun_unbusy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel")); --tp->tun_busy; /* Wake up anything that may be waiting on our busy tunnel. */ if (tp->tun_busy == 0) cv_broadcast(&tp->tun_cv); } static int tun_busy(struct tuntap_softc *tp) { int ret; TUN_LOCK(tp); ret = tun_busy_locked(tp); TUN_UNLOCK(tp); return (ret); } static void tun_unbusy(struct tuntap_softc *tp) { TUN_LOCK(tp); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. 
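 * __DECONST() below only strips the const qualifier to satisfy the dev_stdclone()
 * prototype; since the second argument is always NULL here, dev_stdclone() never
 * modifies the name string.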
*/ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, struct ifc_data *ifd, struct ifnet **ifpp) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that's okay. 
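 * A request such as `ifconfig tun3 create' arrives here with unit == 3;
 * alloc_unr_specific() reserves exactly that unit or fails, in which case we
 * report EEXIST rather than silently picking a different unit.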
*/ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ dev = NULL; i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); /* No preexisting struct cdev *, create one */ if (i != 0) i = tun_create_device(drv, unit, NULL, &dev, name); if (i == 0) { dev_ref(dev); tuncreate(dev); struct tuntap_softc *tp = dev->si_drv1; *ifpp = tp->tun_ifp; } return (i); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } i = tun_create_device(drv, u, cred, dev, name); } if (i == 0) { dev_ref(*dev); if_clone_create(name, namelen, NULL); } out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if (tp->tun_busy != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); /* destroy_dev will take care of any alias. 
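 * The alias cdev created by tunrename() depends on tun_dev, so destroying tun_dev
 * removes the alias as well and tun_alias needs no separate teardown here.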
*/ destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } sx_xlock(&tun_ioctl_sx); TUN2IFP(tp)->if_softc = NULL; sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp, uint32_t flags) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; struct if_clone_addreq req = { .match_f = drv->clone_match_fn, .create_f = drv->clone_create_fn, .destroy_f = drv->clone_destroy_fn, }; drvc->cloner = ifc_attach_cloner(drv->cdevsw.d_name, &req); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static struct tuntap_driver * tuntap_driver_from_ifnet(const struct ifnet *ifp) { struct tuntap_driver *drv; int i; if (ifp == NULL) return (NULL); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0) return (drv); } return (NULL); } static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, tunrename, 0, 1000); if (arrival_tag == NULL) return (ENOMEM); clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (clone_tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; /* We'll only ever have these two, so no need for a macro. 
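 * The empty if_tun and if_tap moduledata entries below only publish MODULE_VERSION
 * stubs, so existing references to those historical module names keep resolving.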
*/ static moduledata_t tun_mod = { "if_tun", NULL, 0 }; static moduledata_t tap_mod = { "if_tap", NULL, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tun, 1); DECLARE_MODULE(if_tap, tap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tap, 1); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name) { struct make_dev_args args; struct tuntap_softc *tp; int error; tp = malloc(sizeof(*tp), M_TUN, M_WAITOK | M_ZERO); mtx_init(&tp->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&tp->tun_cv, "tun_condvar"); tp->tun_flags = drv->ident_flags; tp->tun_drv = drv; make_dev_args_init(&args); if (cr != NULL) args.mda_flags = MAKEDEV_REF | MAKEDEV_CHECKNAME; args.mda_devsw = &drv->cdevsw; args.mda_cr = cr; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0600; args.mda_unit = unit; args.mda_si_drv1 = tp; error = make_dev_s(&args, dev, "%s", name); if (error != 0) { free(tp, M_TUN); return (error); } KASSERT((*dev)->si_drv1 != NULL, ("Failed to set si_drv1 at %s creation", name)); tp->tun_dev = *dev; knlist_init_mtx(&tp->tun_rsel.si_note, &tp->tun_mtx); mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); return (0); } static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ static int tap_transmit(struct ifnet *ifp, struct mbuf *m) { int error; BPF_MTAP(ifp, m); IFQ_HANDOFF(ifp, m, error); return (error); } /* XXX: should return an error code so it can fail. 
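 * As it stands, tuncreate() panics if if_alloc() fails; returning an error instead
 * would let the clone and devfs open paths fail gracefully.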
*/ static void tuncreate(struct cdev *dev) { struct tuntap_driver *drv; struct tuntap_softc *tp; struct ifnet *ifp; struct ether_addr eaddr; int iflags; u_char type; tp = dev->si_drv1; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); drv = tp->tun_drv; iflags = IFF_MULTICAST; if ((tp->tun_flags & TUN_L2) != 0) { type = IFT_ETHER; iflags |= IFF_BROADCAST | IFF_SIMPLEX; } else { type = IFT_PPP; iflags |= IFF_POINTOPOINT; } ifp = tp->tun_ifp = if_alloc(type); if (ifp == NULL) panic("%s%d: failed to if_alloc() interface.\n", drv->cdevsw.d_name, dev2unit(dev)); ifp->if_softc = tp; if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); ifp->if_ioctl = tunifioctl; ifp->if_flags = iflags; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_capabilities |= IFCAP_LINKSTATE; if ((tp->tun_flags & TUN_L2) != 0) ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO; ifp->if_capenable |= IFCAP_LINKSTATE; if ((tp->tun_flags & TUN_L2) != 0) { ifp->if_init = tunifinit; ifp->if_start = tunstart_l2; ifp->if_transmit = tap_transmit; ifp->if_qflush = if_qflush; ether_gen_addr(ifp, &eaddr); ether_ifattach(ifp, eaddr.octet); } else { ifp->if_mtu = TUNMTU; ifp->if_start = tunstart; ifp->if_output = tunoutput; ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); } TUN_LOCK(tp); tp->tun_flags |= TUN_INITED; TUN_UNLOCK(tp); TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static void tunrename(void *arg __unused, struct ifnet *ifp) { struct tuntap_softc *tp; int error; if ((ifp->if_flags & IFF_RENAMING) == 0) return; if (tuntap_driver_from_ifnet(ifp) == NULL) return; /* * We need to grab the ioctl sx long enough to make sure the softc is * still there. If it is, we can safely try to busy the tun device. * The busy may fail if the device is currently dying, in which case * we do nothing. If it doesn't fail, the busy count stops the device * from dying until we've created the alias (that will then be * subsequently destroyed). */ sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { sx_xunlock(&tun_ioctl_sx); return; } error = tun_busy(tp); sx_xunlock(&tun_ioctl_sx); if (error != 0) return; if (tp->tun_alias != NULL) { destroy_dev(tp->tun_alias); tp->tun_alias = NULL; } if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0) goto out; /* * Failure's ok, aliases are created on a best effort basis. If a * tun user/consumer decides to rename the interface to conflict with * another device (non-ifnet) on the system, we will assume they know * what they are doing. make_dev_alias_p won't touch tun_alias on * failure, so we use it but ignore the return value. 
*/ make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s", ifp->if_xname); out: tun_unbusy(tp); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tuntap_softc *tp; int error __diagused, tunflags; tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); return (error); /* Shouldn't happen */ } tp = dev->si_drv1; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); TUN_LOCK(tp); if ((tp->tun_flags & TUN_INITED) == 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (ENXIO); } if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (EBUSY); } error = tun_busy_locked(tp); KASSERT(error == 0, ("Must be able to busy an unopen tunnel")); ifp = TUN2IFP(tp); if ((tp->tun_flags & TUN_L2) != 0) { bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); TUN_UNLOCK(tp); /* * This can fail with either ENOENT or EBUSY. This is in the middle of * d_open, so ENOENT should not be possible. EBUSY is possible, but * the only cdevpriv dtor being set will be tundtor and the softc being * passed is constant for a given cdev. We ignore the possible error * because of this as either "unlikely" or "not actually a problem." */ (void)devfs_set_cdevpriv(tp, tundtor); CURVNET_RESTORE(); return (0); } /* * tundtor - tear down the device - mark i/f down & delete * routing info */ static void tundtor(void *data) { struct proc *p; struct tuntap_softc *tp; struct ifnet *ifp; bool l2tun; tp = data; p = curproc; ifp = TUN2IFP(tp); TUN_LOCK(tp); /* * Realistically, we can't be obstinate here. This only means that the * tuntap device was closed out of order, and the last closer wasn't the * controller. These are still good to know about, though, as software * should avoid multiple processes with a tuntap device open and * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in * parent). */ if (p->p_pid != tp->tun_pid) { log(LOG_INFO, "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n", p->p_pid, p->p_comm, tp->tun_dev->si_name); } /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); l2tun = false; if ((tp->tun_flags & TUN_L2) != 0) { l2tun = true; IF_DRAIN(&ifp->if_snd); } else { IFQ_PURGE(&ifp->if_snd); } /* For vmnet, we won't do most of the address/route bits */ if ((tp->tun_flags & TUN_VMNET) != 0 || (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) goto out; #if defined(INET) || defined(INET6) if (l2tun && tp->tun_lro_ready) { TUNDEBUG (ifp, "LRO disabled\n"); tcp_lro_free(&tp->tun_lro); tp->tun_lro_ready = false; } #endif if (ifp->if_flags & IFF_UP) { TUN_UNLOCK(tp); if_down(ifp); TUN_LOCK(tp); } /* Delete all addresses and routes which reference this interface. 
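 * TUN_LOCK is dropped around if_purgeaddrs() below, presumably because purging the
 * addresses can require other subsystem locks.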
*/ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; TUN_UNLOCK(tp); if_purgeaddrs(ifp); TUN_LOCK(tp); } out: if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; tun_vnethdr_set(ifp, 0); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } static void tuninit(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "tuninit\n"); TUN_LOCK(tp); ifp->if_drv_flags |= IFF_DRV_RUNNING; if ((tp->tun_flags & TUN_L2) == 0) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); TUN_UNLOCK(tp); } else { #if defined(INET) || defined(INET6) if (tcp_lro_init(&tp->tun_lro) == 0) { TUNDEBUG(ifp, "LRO enabled\n"); tp->tun_lro.ifp = ifp; tp->tun_lro_ready = true; } else { TUNDEBUG(ifp, "Could not enable LRO\n"); tp->tun_lro_ready = false; } #endif ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); /* attempt to start output */ tunstart_l2(ifp); } } /* * Used only for l2 tunnel. */ static void tunifinit(void *xtp) { struct tuntap_softc *tp; tp = (struct tuntap_softc *)xtp; tuninit(tp->tun_ifp); } /* * To be called under TUN_LOCK. Update ifp->if_hwassist according to the * current value of ifp->if_capenable. */ static void tun_caps_changed(struct ifnet *ifp) { uint64_t hwassist = 0; TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc); if (ifp->if_capenable & IFCAP_TXCSUM) hwassist |= CSUM_TCP | CSUM_UDP; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6; if (ifp->if_capenable & IFCAP_TSO4) hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) hwassist |= CSUM_IP6_TSO; ifp->if_hwassist = hwassist; } /* * To be called under TUN_LOCK. Update tp->tun_vhdrlen and adjust * if_capabilities and if_capenable as needed. */ static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen) { struct tuntap_softc *tp = ifp->if_softc; TUN_LOCK_ASSERT(tp); if (tp->tun_vhdrlen == vhdrlen) return; /* * Update if_capabilities to reflect the * functionalities offered by the virtio-net * header. */ if (vhdrlen != 0) ifp->if_capabilities |= TAP_VNET_HDR_CAPS; else ifp->if_capabilities &= ~TAP_VNET_HDR_CAPS; /* * Disable any capabilities that we don't * support anymore. */ ifp->if_capenable &= ifp->if_capabilities; tun_caps_changed(ifp); tp->tun_vhdrlen = vhdrlen; TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n", vhdrlen, ifp->if_capabilities); } /* * Process an ioctl request. 
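 * tun_ioctl_sx serializes these handlers against interface destruction:
 * tun_destroy() clears if_softc under the same lock, which is why the handler
 * begins by re-checking the softc pointer.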
*/ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tuntap_softc *tp; struct ifstat *ifs; struct ifmediareq *ifmr; int dummy, error = 0; bool l2tun; ifmr = NULL; sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { error = ENXIO; goto bad; } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; TUN_LOCK(tp); if (tp->tun_pid) snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); else ifs->ascii[0] = '\0'; TUN_UNLOCK(tp); break; case SIOCSIFADDR: if (l2tun) error = ether_ioctl(ifp, cmd, data); else tuninit(ifp); if (error == 0) TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFMEDIA: if (!l2tun) { error = EINVAL; break; } ifmr = (struct ifmediareq *)data; dummy = ifmr->ifm_count; ifmr->ifm_count = 1; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER | IFM_FDX | IFM_1000_T; if (tp->tun_flags & TUN_OPEN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_current = ifmr->ifm_active; if (dummy >= 1) { int media = IFM_ETHER; error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); } break; case SIOCSIFCAP: TUN_LOCK(tp); ifp->if_capenable = ifr->ifr_reqcap; tun_caps_changed(ifp); TUN_UNLOCK(tp); VLAN_CAPABILITIES(ifp); break; default: if (l2tun) { error = ether_ioctl(ifp, cmd, data); } else { error = EINVAL; } } bad: sx_xunlock(&tun_ioctl_sx); return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); BPF_MTAP2(ifp, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. 
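The one-line change to tunoutput() above matters because bpf(4) writes arrive here with dst->sa_family set to AF_UNSPEC, or to pseudo_AF_HDRCMPLT when the descriptor has header-complete mode enabled; in both cases the DLT_NULL pseudo header (a 4-byte address family) is carried in sa_data rather than being derivable from the route. A hedged userland sketch that exercises the new branch follows; it assumes tun0 exists, is up, and is held open by a reader, and the zeroed IPv4 header is only a placeholder.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/bpf.h>
#include <net/if.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	u_int hdrcmplt = 1;
	u_int32_t af = AF_INET;		/* DLT_NULL header, host byte order */
	char pkt[4 + 20];		/* AF header + minimal IPv4 header */
	int fd;

	fd = open("/dev/bpf", O_WRONLY);
	if (fd == -1)
		return (1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "tun0", sizeof(ifr.ifr_name));
	if (ioctl(fd, BIOCSETIF, &ifr) == -1 ||
	    ioctl(fd, BIOCSHDRCMPLT, &hdrcmplt) == -1)
		return (1);

	memset(pkt, 0, sizeof(pkt));
	memcpy(pkt, &af, sizeof(af));
	/* A real test would fill in a valid IPv4 header here. */
	(void)write(fd, pkt, sizeof(pkt));
	close(fd);
	return (0);
}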
*/ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifreq ifr, *ifrp; struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct tuninfo *tunp; int error, iflags, ival; bool l2tun; l2tun = (tp->tun_flags & TUN_L2) != 0; if (l2tun) { /* tap specific ioctls */ switch(cmd) { /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ iflags = *(int *)data; iflags &= TUN_VMIO_FLAG_MASK; iflags &= ~IFF_CANTCHANGE; iflags |= IFF_UP; TUN_LOCK(tp); ifp->if_flags = iflags | (ifp->if_flags & IFF_CANTCHANGE); TUN_UNLOCK(tp); return (0); case SIOCGIFADDR: /* get MAC address of the remote side */ TUN_LOCK(tp); bcopy(&tp->tun_ether.octet, data, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case SIOCSIFADDR: /* set MAC address of the remote side */ TUN_LOCK(tp); bcopy(data, &tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case TAPSVNETHDR: ival = *(int *)data; if (ival != 0 && ival != sizeof(struct virtio_net_hdr) && ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) { return (EINVAL); } TUN_LOCK(tp); tun_vnethdr_set(ifp, ival); TUN_UNLOCK(tp); return (0); case TAPGVNETHDR: TUN_LOCK(tp); *(int *)data = tp->tun_vhdrlen; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } else { switch (cmd) { case TUNSLMODE: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; TUN_UNLOCK(tp); return (0); case TUNSIFHEAD: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; TUN_UNLOCK(tp); return (0); case TUNGIFHEAD: TUN_LOCK(tp); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 
1 : 0; TUN_UNLOCK(tp); return (0); case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return (EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: TUN_LOCK(tp); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; TUN_UNLOCK(tp); break; default: return (EINVAL); } return (0); case TUNSIFPID: TUN_LOCK(tp); tp->tun_pid = curthread->td_proc->p_pid; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } switch (cmd) { case TUNGIFNAME: ifrp = (struct ifreq *)data; strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); return (0); case TUNSIFINFO: tunp = (struct tuninfo *)data; if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); TUN_LOCK(tp); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), (caddr_t)&ifr, td); CURVNET_RESTORE(); if (error) { TUN_UNLOCK(tp); return (error); } } TUN2IFP(tp)->if_baudrate = tunp->baudrate; TUN_UNLOCK(tp); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; TUN_LOCK(tp); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; TUN_UNLOCK(tp); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case FIONBIO: break; case FIOASYNC: TUN_LOCK(tp); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; TUN_UNLOCK(tp); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. 
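For reference, the common cdev ioctls handled above are reachable from ordinary userland code with the definitions in net/if_tun.h. A small sketch, assuming an existing tun0 held open by the caller:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <net/if_tun.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	int fd, on = 1, queued = 0;

	fd = open("/dev/tun0", O_RDWR);
	if (fd == -1)
		return (1);

	/* TUNGIFNAME: which interface sits behind this descriptor. */
	if (ioctl(fd, TUNGIFNAME, &ifr) == 0)
		printf("attached to %s\n", ifr.ifr_name);

	/* TUNSIFHEAD: prefix packets with a 4-byte address family
	 * (the handler above clears TUN_LMODE at the same time). */
	ioctl(fd, TUNSIFHEAD, &on);

	/* FIONREAD: byte count of the next queued packet, 0 if none. */
	ioctl(fd, FIONREAD, &queued);
	printf("%d bytes queued\n", queued);

	close(fd);
	return (0);
}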
*/ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; size_t len; int error = 0; TUNDEBUG (ifp, "read\n"); TUN_LOCK(tp); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUN_UNLOCK(tp); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m != NULL) break; if (flag & O_NONBLOCK) { TUN_UNLOCK(tp); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { TUN_UNLOCK(tp); return (error); } } TUN_UNLOCK(tp); len = min(tp->tun_vhdrlen, uio->uio_resid); if (len > 0) { struct virtio_net_hdr_mrg_rxbuf vhdr; bzero(&vhdr, sizeof(vhdr)); if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) { m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr); } TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " "gs %u, cs %u, co %u\n", vhdr.hdr.flags, vhdr.hdr.gso_type, vhdr.hdr.hdr_len, vhdr.hdr.gso_size, vhdr.hdr.csum_start, vhdr.hdr.csum_offset); error = uiomove(&vhdr, len, uio); } while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); if (len != 0) error = uiomove(mtod(m, void *), len, uio); m = m_free(m); } if (m) { TUNDEBUG(ifp, "Dropping mbuf\n"); m_freem(m); } return (error); } static int tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m, struct virtio_net_hdr_mrg_rxbuf *vhdr) { struct epoch_tracker et; struct ether_header *eh; struct ifnet *ifp; ifp = TUN2IFP(tp); /* * Only pass a unicast frame to ether_input(), if it would * actually have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if ((ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } if (vhdr != NULL) { if (virtio_net_rx_csum(m, &vhdr->hdr)) { m_freem(m); return (0); } } else { switch (ntohs(eh->ether_type)) { #ifdef INET case ETHERTYPE_IP: if (ifp->if_capenable & IFCAP_RXCSUM) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_SCTP_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } break; #endif #ifdef INET6 case ETHERTYPE_IPV6: if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_SCTP_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } break; #endif } } /* Pass packet up to parent. */ CURVNET_SET(ifp->if_vnet); NET_EPOCH_ENTER(et); #if defined(INET) || defined(INET6) if (tp->tun_lro_ready && ifp->if_capenable & IFCAP_LRO && tcp_lro_rx(&tp->tun_lro, m, 0) == 0) tcp_lro_flush_all(&tp->tun_lro); else #endif (*ifp->if_input)(ifp, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); /* ibytes are counted in parent */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); return (0); } static int tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) { struct epoch_tracker et; struct ifnet *ifp; int family, isr; ifp = TUN2IFP(tp); /* Could be unlocked read? 
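A read-side sketch matching tunread() above: each read() returns at most one packet (or as much of it as fits in the buffer, the remainder being dropped), and when TUNSIFHEAD is enabled the payload is preceded by the 4-byte address family that tunoutput() prepended in network byte order. Hypothetical helper, not part of this change.

#include <sys/types.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
tun_drain(int fd)
{
	char buf[2048];
	u_int32_t af;
	ssize_t n;

	for (;;) {
		/* Blocks unless the descriptor is O_NONBLOCK. */
		n = read(fd, buf, sizeof(buf));
		if (n <= 0)
			break;
		if (n >= (ssize_t)sizeof(af)) {
			memcpy(&af, buf, sizeof(af));
			printf("packet: %zd bytes, af %u\n", n, ntohl(af));
		}
	}
}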
*/ TUN_LOCK(tp); if (tp->tun_flags & TUN_IFHEAD) { TUN_UNLOCK(tp); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { TUN_UNLOCK(tp); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { struct virtio_net_hdr_mrg_rxbuf vhdr; struct tuntap_softc *tp; struct ifnet *ifp; struct mbuf *m; uint32_t mru; int align, vhdrlen, error; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); l2tun = (tp->tun_flags & TUN_L2) != 0; mru = l2tun ? TAPMRU : TUNMRU; vhdrlen = tp->tun_vhdrlen; align = 0; if (l2tun) { align = ETHER_ALIGN; mru += vhdrlen; } else if ((tp->tun_flags & TUN_IFHEAD) != 0) mru += sizeof(uint32_t); /* family */ if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } if (vhdrlen > 0) { error = uiomove(&vhdr, vhdrlen, uio); if (error != 0) return (error); TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " "gs %u, cs %u, co %u\n", vhdr.hdr.flags, vhdr.hdr.gso_type, vhdr.hdr.hdr_len, vhdr.hdr.gso_size, vhdr.hdr.csum_start, vhdr.hdr.csum_offset); } if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (l2tun) return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL)); return (tunwrite_l3(tp, m)); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. 
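The corresponding write-side sketch for tunwrite()/tunwrite_l3() above: one write() (or writev()) is one packet, an oversized or zero-length write is rejected or ignored, and with TUNSIFHEAD enabled the first 4 bytes must carry the address family, which tunwrite_l3() converts with ntohl(). Hypothetical helper:

#include <sys/types.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <unistd.h>

static ssize_t
tun_write_pkt(int fd, void *pkt, size_t len, int af)
{
	struct iovec iov[2];
	u_int32_t hdr = htonl((u_int32_t)af);

	iov[0].iov_base = &hdr;		/* 4-byte AF header (TUNSIFHEAD) */
	iov[0].iov_len = sizeof(hdr);
	iov[1].iov_base = pkt;
	iov[1].iov_len = len;
	return (writev(fd, iov, 2));
}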
*/ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } diff --git a/sys/netgraph/ng_iface.c b/sys/netgraph/ng_iface.c index 8ae4707b7abd..e9f97ff0fdec 100644 --- a/sys/netgraph/ng_iface.c +++ b/sys/netgraph/ng_iface.c @@ -1,817 +1,817 @@ /* * ng_iface.c */ /*- * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. 
* IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Archie Cobbs * $Whistle: ng_iface.c,v 1.33 1999/11/01 09:24:51 julian Exp $ */ /* * This node is also a system networking interface. It has * a hook for each protocol (IP, AppleTalk, etc). Packets * are simply relayed between the interface and the hooks. * * Interfaces are named ng0, ng1, etc. New nodes take the * first available interface name. * * This node also includes Berkeley packet filter support. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_IFACE, "netgraph_iface", "netgraph iface node"); #else #define M_NETGRAPH_IFACE M_NETGRAPH #endif static SYSCTL_NODE(_net_graph, OID_AUTO, iface, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Point to point netgraph interface"); VNET_DEFINE_STATIC(int, ng_iface_max_nest) = 2; #define V_ng_iface_max_nest VNET(ng_iface_max_nest) SYSCTL_INT(_net_graph_iface, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ng_iface_max_nest), 0, "Max nested tunnels"); /* This struct describes one address family */ struct iffam { sa_family_t family; /* Address family */ const char *hookname; /* Name for hook */ }; typedef const struct iffam *iffam_p; /* List of address families supported by our interface */ const static struct iffam gFamilies[] = { { AF_INET, NG_IFACE_HOOK_INET }, { AF_INET6, NG_IFACE_HOOK_INET6 }, }; #define NUM_FAMILIES nitems(gFamilies) /* Node private data */ struct ng_iface_private { struct ifnet *ifp; /* Our interface */ int unit; /* Interface unit number */ node_p node; /* Our netgraph node */ hook_p hooks[NUM_FAMILIES]; /* Hook for each address family */ struct rmlock lock; /* Protect private data changes */ }; typedef struct ng_iface_private *priv_p; #define PRIV_RLOCK(priv, t) rm_rlock(&priv->lock, t) #define PRIV_RUNLOCK(priv, t) rm_runlock(&priv->lock, t) #define PRIV_WLOCK(priv) rm_wlock(&priv->lock) #define PRIV_WUNLOCK(priv) rm_wunlock(&priv->lock) /* Interface methods */ static void ng_iface_start(struct ifnet *ifp); static int ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ng_iface_output(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro); static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family); static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, sa_family_t sa); #ifdef DEBUG static void ng_iface_print_ioctl(struct ifnet *ifp, int cmd, caddr_t data); #endif /* Netgraph methods */ static int ng_iface_mod_event(module_t, int, void *); static ng_constructor_t ng_iface_constructor; static ng_rcvmsg_t ng_iface_rcvmsg; static ng_shutdown_t ng_iface_shutdown; static ng_newhook_t ng_iface_newhook; static ng_rcvdata_t ng_iface_rcvdata; 
static ng_disconnect_t ng_iface_disconnect; /* Helper stuff */ static iffam_p get_iffam_from_af(sa_family_t family); static iffam_p get_iffam_from_hook(priv_p priv, hook_p hook); static iffam_p get_iffam_from_name(const char *name); static hook_p *get_hook_from_iffam(priv_p priv, iffam_p iffam); /* List of commands and how to convert arguments to/from ASCII */ static const struct ng_cmdlist ng_iface_cmds[] = { { NGM_IFACE_COOKIE, NGM_IFACE_GET_IFNAME, "getifname", NULL, &ng_parse_string_type }, { NGM_IFACE_COOKIE, NGM_IFACE_POINT2POINT, "point2point", NULL, NULL }, { NGM_IFACE_COOKIE, NGM_IFACE_BROADCAST, "broadcast", NULL, NULL }, { NGM_IFACE_COOKIE, NGM_IFACE_GET_IFINDEX, "getifindex", NULL, &ng_parse_uint32_type }, { 0 } }; /* Node type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_IFACE_NODE_TYPE, .mod_event = ng_iface_mod_event, .constructor = ng_iface_constructor, .rcvmsg = ng_iface_rcvmsg, .shutdown = ng_iface_shutdown, .newhook = ng_iface_newhook, .rcvdata = ng_iface_rcvdata, .disconnect = ng_iface_disconnect, .cmdlist = ng_iface_cmds, }; NETGRAPH_INIT(iface, &typestruct); VNET_DEFINE_STATIC(struct unrhdr *, ng_iface_unit); #define V_ng_iface_unit VNET(ng_iface_unit) /************************************************************************ HELPER STUFF ************************************************************************/ /* * Get the family descriptor from the family ID */ static __inline iffam_p get_iffam_from_af(sa_family_t family) { iffam_p iffam; int k; for (k = 0; k < NUM_FAMILIES; k++) { iffam = &gFamilies[k]; if (iffam->family == family) return (iffam); } return (NULL); } /* * Get the family descriptor from the hook */ static __inline iffam_p get_iffam_from_hook(priv_p priv, hook_p hook) { int k; for (k = 0; k < NUM_FAMILIES; k++) if (priv->hooks[k] == hook) return (&gFamilies[k]); return (NULL); } /* * Get the hook from the iffam descriptor */ static __inline hook_p * get_hook_from_iffam(priv_p priv, iffam_p iffam) { return (&priv->hooks[iffam - gFamilies]); } /* * Get the iffam descriptor from the name */ static __inline iffam_p get_iffam_from_name(const char *name) { iffam_p iffam; int k; for (k = 0; k < NUM_FAMILIES; k++) { iffam = &gFamilies[k]; if (!strcmp(iffam->hookname, name)) return (iffam); } return (NULL); } /************************************************************************ INTERFACE STUFF ************************************************************************/ /* * Process an ioctl for the virtual interface */ static int ng_iface_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifreq *const ifr = (struct ifreq *) data; int error = 0; #ifdef DEBUG ng_iface_print_ioctl(ifp, command, data); #endif switch (command) { /* These two are mostly handled at a higher layer */ case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE); break; case SIOCGIFADDR: break; /* Set flags */ case SIOCSIFFLAGS: /* * If the interface is marked up and stopped, then start it. * If it is marked down and running, then stop it. 
*/ if (ifr->ifr_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE); ifp->if_drv_flags |= IFF_DRV_RUNNING; } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } break; /* Set the interface MTU */ case SIOCSIFMTU: if (ifr->ifr_mtu > NG_IFACE_MTU_MAX || ifr->ifr_mtu < NG_IFACE_MTU_MIN) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; break; /* Stuff that's not supported */ case SIOCADDMULTI: case SIOCDELMULTI: error = 0; break; case SIOCSIFPHYS: error = EOPNOTSUPP; break; default: error = EINVAL; break; } return (error); } /* * This routine is called to deliver a packet out the interface. * We simply look at the address family and relay the packet to * the corresponding hook, if it exists and is connected. */ static int ng_iface_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; int error; /* Check interface flags */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { m_freem(m); return (ENETDOWN); } /* Protect from deadly infinite recursion. */ error = if_tunnel_check_nesting(ifp, m, NGM_IFACE_COOKIE, V_ng_iface_max_nest); if (error) { m_freem(m); return (error); } /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); /* Berkeley packet filter */ ng_iface_bpftap(ifp, m, af); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { M_PREPEND(m, sizeof(sa_family_t), M_NOWAIT); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return (ENOBUFS); } *(sa_family_t *)m->m_data = af; error = (ifp->if_transmit)(ifp, m); } else error = ng_iface_send(ifp, m, af); return (error); } /* * Start method is used only when ALTQ is enabled. */ static void ng_iface_start(struct ifnet *ifp) { struct mbuf *m; sa_family_t sa; KASSERT(ALTQ_IS_ENABLED(&ifp->if_snd), ("%s without ALTQ", __func__)); for(;;) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; sa = *mtod(m, sa_family_t *); m_adj(m, sizeof(sa_family_t)); ng_iface_send(ifp, m, sa); } } /* * Flash a packet by the BPF (requires prepending 4 byte AF header) * Note the phoney mbuf; this is OK because BPF treats it read-only. */ static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family) { KASSERT(family != AF_UNSPEC, ("%s: family=AF_UNSPEC", __func__)); if (bpf_peers_present(ifp->if_bpf)) { int32_t family4 = (int32_t)family; bpf_mtap2(ifp->if_bpf, &family4, sizeof(family4), m); } } /* * This routine does actual delivery of the packet into the * netgraph(4). It is called from ng_iface_start() and * ng_iface_output(). */ static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, sa_family_t sa) { struct rm_priotracker priv_tracker; const priv_p priv = (priv_p) ifp->if_softc; const iffam_p iffam = get_iffam_from_af(sa); hook_p hook; int error; int len; /* Check address family to determine hook (if known) */ if (iffam == NULL) { m_freem(m); log(LOG_WARNING, "%s: can't handle af%d\n", ifp->if_xname, sa); return (EAFNOSUPPORT); } /* Copy length before the mbuf gets invalidated. 
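The nesting guard in ng_iface_output() above is bounded by the net.graph.iface.max_nesting sysctl declared near the top of this file (default 2). A hedged sketch of inspecting and raising it from a program with sysctlbyname(3):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int nest = 0, newnest = 4;
	size_t len = sizeof(nest);

	sysctlbyname("net.graph.iface.max_nesting", &nest, &len, NULL, 0);
	printf("max_nesting is %d\n", nest);

	/* Raising it needs privilege; the in-tree default is 2. */
	sysctlbyname("net.graph.iface.max_nesting", NULL, NULL,
	    &newnest, sizeof(newnest));
	return (0);
}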
*/ len = m->m_pkthdr.len; PRIV_RLOCK(priv, &priv_tracker); hook = *get_hook_from_iffam(priv, iffam); if (hook == NULL) { NG_FREE_M(m); PRIV_RUNLOCK(priv, &priv_tracker); return ENETDOWN; } NG_HOOK_REF(hook); PRIV_RUNLOCK(priv, &priv_tracker); NG_OUTBOUND_THREAD_REF(); NG_SEND_DATA_ONLY(error, hook, m); NG_OUTBOUND_THREAD_UNREF(); NG_HOOK_UNREF(hook); /* Update stats. */ if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); } return (error); } #ifdef DEBUG /* * Display an ioctl to the virtual interface */ static void ng_iface_print_ioctl(struct ifnet *ifp, int command, caddr_t data) { char *str; switch (command & IOC_DIRMASK) { case IOC_VOID: str = "IO"; break; case IOC_OUT: str = "IOR"; break; case IOC_IN: str = "IOW"; break; case IOC_INOUT: str = "IORW"; break; default: str = "IO??"; } log(LOG_DEBUG, "%s: %s('%c', %d, char[%d])\n", ifp->if_xname, str, IOCGROUP(command), command & 0xff, IOCPARM_LEN(command)); } #endif /* DEBUG */ /************************************************************************ NETGRAPH NODE STUFF ************************************************************************/ /* * Constructor for a node */ static int ng_iface_constructor(node_p node) { struct ifnet *ifp; priv_p priv; /* Allocate node and interface private structures */ priv = malloc(sizeof(*priv), M_NETGRAPH_IFACE, M_WAITOK | M_ZERO); ifp = if_alloc(IFT_PROPVIRTUAL); if (ifp == NULL) { free(priv, M_NETGRAPH_IFACE); return (ENOMEM); } rm_init(&priv->lock, "ng_iface private rmlock"); /* Link them together */ ifp->if_softc = priv; priv->ifp = ifp; /* Get an interface unit number */ priv->unit = alloc_unr(V_ng_iface_unit); /* Link together node and private info */ NG_NODE_SET_PRIVATE(node, priv); priv->node = node; /* Initialize interface structure */ if_initname(ifp, NG_IFACE_IFACE_NAME, priv->unit); ifp->if_output = ng_iface_output; ifp->if_start = ng_iface_start; ifp->if_ioctl = ng_iface_ioctl; ifp->if_mtu = NG_IFACE_MTU_DEFAULT; ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST); ifp->if_type = IFT_PROPVIRTUAL; /* XXX */ ifp->if_addrlen = 0; /* XXX */ ifp->if_hdrlen = 0; /* XXX */ ifp->if_baudrate = 64000; /* XXX */ IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; IFQ_SET_READY(&ifp->if_snd); /* Give this node the same name as the interface (if possible) */ if (ng_name_node(node, ifp->if_xname) != 0) log(LOG_WARNING, "%s: can't acquire netgraph name\n", ifp->if_xname); /* Attach the interface */ if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); /* Done */ return (0); } /* * Give our ok for a hook to be added */ static int ng_iface_newhook(node_p node, hook_p hook, const char *name) { const iffam_p iffam = get_iffam_from_name(name); const priv_p priv = NG_NODE_PRIVATE(node); hook_p *hookptr; if (iffam == NULL) return (EPFNOSUPPORT); PRIV_WLOCK(priv); hookptr = get_hook_from_iffam(priv, iffam); if (*hookptr != NULL) { PRIV_WUNLOCK(priv); return (EISCONN); } *hookptr = hook; NG_HOOK_HI_STACK(hook); NG_HOOK_SET_TO_INBOUND(hook); PRIV_WUNLOCK(priv); return (0); } /* * Receive a control message */ static int ng_iface_rcvmsg(node_p node, item_p item, hook_p lasthook) { const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *const ifp = priv->ifp; struct ng_mesg *resp = NULL; int error = 0; struct ng_mesg *msg; NGI_GET_MSG(item, msg); switch (msg->header.typecookie) { case NGM_IFACE_COOKIE: switch (msg->header.cmd) { case NGM_IFACE_GET_IFNAME: NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT); if (resp 
== NULL) { error = ENOMEM; break; } strlcpy(resp->data, ifp->if_xname, IFNAMSIZ); break; case NGM_IFACE_POINT2POINT: case NGM_IFACE_BROADCAST: { /* Deny request if interface is UP */ if ((ifp->if_flags & IFF_UP) != 0) return (EBUSY); /* Change flags */ switch (msg->header.cmd) { case NGM_IFACE_POINT2POINT: ifp->if_flags |= IFF_POINTOPOINT; ifp->if_flags &= ~IFF_BROADCAST; break; case NGM_IFACE_BROADCAST: ifp->if_flags &= ~IFF_POINTOPOINT; ifp->if_flags |= IFF_BROADCAST; break; } break; } case NGM_IFACE_GET_IFINDEX: NG_MKRESPONSE(resp, msg, sizeof(uint32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((uint32_t *)resp->data) = priv->ifp->if_index; break; default: error = EINVAL; break; } break; case NGM_FLOW_COOKIE: switch (msg->header.cmd) { case NGM_LINK_IS_UP: if_link_state_change(ifp, LINK_STATE_UP); break; case NGM_LINK_IS_DOWN: if_link_state_change(ifp, LINK_STATE_DOWN); break; default: break; } break; default: error = EINVAL; break; } NG_RESPOND_MSG(error, node, item, resp); NG_FREE_MSG(msg); return (error); } /* * Recive data from a hook. Pass the packet to the correct input routine. */ static int ng_iface_rcvdata(hook_p hook, item_p item) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); const iffam_p iffam = get_iffam_from_hook(priv, hook); struct ifnet *const ifp = priv->ifp; struct epoch_tracker et; struct mbuf *m; int isr; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* Sanity checks */ KASSERT(iffam != NULL, ("%s: iffam", __func__)); M_ASSERTPKTHDR(m); if ((ifp->if_flags & IFF_UP) == 0) { NG_FREE_M(m); return (ENETDOWN); } /* Update interface stats */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Note receiving interface */ m->m_pkthdr.rcvif = ifp; /* Berkeley packet filter */ ng_iface_bpftap(ifp, m, iffam->family); /* Send packet */ switch (iffam->family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_NG); M_SETFIB(m, ifp->if_fib); CURVNET_SET(ifp->if_vnet); NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * Shutdown and remove the node and its associated interface. */ static int ng_iface_shutdown(node_p node) { const priv_p priv = NG_NODE_PRIVATE(node); /* * The ifnet may be in a different vnet than the netgraph node, * hence we have to change the current vnet context here. */ CURVNET_SET_QUIET(priv->ifp->if_vnet); bpfdetach(priv->ifp); if_detach(priv->ifp); if_free(priv->ifp); CURVNET_RESTORE(); priv->ifp = NULL; free_unr(V_ng_iface_unit, priv->unit); rm_destroy(&priv->lock); free(priv, M_NETGRAPH_IFACE); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); return (0); } /* * Hook disconnection. Note that we do *not* shutdown when all * hooks have been disconnected. */ static int ng_iface_disconnect(hook_p hook) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); const iffam_p iffam = get_iffam_from_hook(priv, hook); if (iffam == NULL) panic("%s", __func__); PRIV_WLOCK(priv); *get_hook_from_iffam(priv, iffam) = NULL; PRIV_WUNLOCK(priv); return (0); } /* * Handle loading and unloading for this node type. 
 */
static int
ng_iface_mod_event(module_t mod, int event, void *data)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
	case MOD_UNLOAD:
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

static void
vnet_ng_iface_init(const void *unused)
{

	V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL);
}
VNET_SYSINIT(vnet_ng_iface_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
    vnet_ng_iface_init, NULL);

static void
vnet_ng_iface_uninit(const void *unused)
{

	delete_unrhdr(V_ng_iface_unit);
}
VNET_SYSUNINIT(vnet_ng_iface_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
    vnet_ng_iface_uninit, NULL);
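To round out the netgraph side, here is a hedged libnetgraph(3) sketch that creates an ng_iface instance and exercises the NGM_IFACE_GET_IFNAME handler shown in ng_iface_rcvmsg() above. The hook is named NG_IFACE_HOOK_INET so that ng_iface_newhook() accepts it; the anonymous socket node and the relative path ".:inet" are conventional libnetgraph usage and should be treated as assumptions of the sketch, not as part of this change. Link with -lnetgraph.

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <netgraph.h>
#include <netgraph/ng_message.h>
#include <netgraph/ng_iface.h>

int
main(void)
{
	struct ngm_mkpeer mkp;
	u_char rbuf[256];
	struct ng_mesg *resp = (struct ng_mesg *)rbuf;
	int cs;

	/* Anonymous control socket node. */
	if (NgMkSockNode(NULL, &cs, NULL) == -1)
		return (1);

	/* Create an ng_iface peer; its hook must be "inet" or "inet6". */
	memset(&mkp, 0, sizeof(mkp));
	strlcpy(mkp.type, NG_IFACE_NODE_TYPE, sizeof(mkp.type));
	strlcpy(mkp.ourhook, NG_IFACE_HOOK_INET, sizeof(mkp.ourhook));
	strlcpy(mkp.peerhook, NG_IFACE_HOOK_INET, sizeof(mkp.peerhook));
	if (NgSendMsg(cs, ".", NGM_GENERIC_COOKIE, NGM_MKPEER,
	    &mkp, sizeof(mkp)) == -1)
		return (1);

	/* Ask the new node which ngN interface it attached. */
	if (NgSendMsg(cs, ".:inet", NGM_IFACE_COOKIE,
	    NGM_IFACE_GET_IFNAME, NULL, 0) == -1 ||
	    NgRecvMsg(cs, resp, sizeof(rbuf), NULL) == -1)
		return (1);
	printf("created %s\n", (char *)resp->data);
	return (0);
}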