Index: head/sys/net/bpf.c =================================================================== --- head/sys/net/bpf.c (revision 356754) +++ head/sys/net/bpf.c (revision 356755) @@ -1,3091 +1,3090 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2019 Andrey V. Elsukov * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_ddb.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include <ddb/ddb.h> #endif #include <net/if.h> #include <net/if_var.h> #include <net/if_dl.h> #include <net/bpf.h> #include <net/bpf_buffer.h> #ifdef BPF_JITTER #include <net/bpf_jitter.h> #endif #include <net/bpf_zerocopy.h> #include <net/bpfdesc.h> #include <net/route.h> #include <net/vnet.h> #include <netinet/in.h> #include <netinet/if_ether.h> #include <sys/kernel.h> #include <sys/sysctl.h> #include <net80211/ieee80211_freebsd.h> #include <security/mac/mac_framework.h> MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); static struct bpf_if_ext dead_bpf_if = { .bif_dlist = CK_LIST_HEAD_INITIALIZER() }; struct bpf_if { #define bif_next bif_ext.bif_next #define bif_dlist bif_ext.bif_dlist struct bpf_if_ext bif_ext; /* public members */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of link header */ struct bpfd_list bif_wlist; /* writer-only list */ struct ifnet *bif_ifp; /* corresponding interface */ struct bpf_if **bif_bpf; /* Pointer to pointer to us */ volatile u_int bif_refcnt; struct epoch_context epoch_ctx; }; CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); struct bpf_program_buffer { struct epoch_context epoch_ctx; #ifdef BPF_JITTER bpf_jit_filter *func; #endif void *buffer[0]; }; #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #define SIZEOF_BPF_HDR(type) \ (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen)) #ifdef COMPAT_FREEBSD32 #include <sys/mount.h> #include <compat/freebsd32/freebsd32.h> #define BPF_ALIGNMENT32 sizeof(int32_t) #define BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32) #ifndef BURN_BRIDGES /* * 32-bit version of structure prepended to each packet. We use this header * instead of the standard one for 32-bit streams. We mark a stream as * 32-bit the first time we see a 32-bit compat ioctl request. */ struct bpf_hdr32 { struct timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #endif struct bpf_program32 { u_int bf_len; uint32_t bf_insns; }; struct bpf_dltlist32 { u_int bfl_len; u_int bfl_list; }; #define BIOCSETF32 _IOW('B', 103, struct bpf_program32) #define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32) #define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32) #define BIOCGDLTLIST32 _IOWR('B', 121, struct bpf_dltlist32) #define BIOCSETWF32 _IOW('B', 123, struct bpf_program32) #define BIOCSETFNR32 _IOW('B', 130, struct bpf_program32) #endif #define BPF_LOCK() sx_xlock(&bpf_sx) #define BPF_UNLOCK() sx_xunlock(&bpf_sx) #define BPF_LOCK_ASSERT() sx_assert(&bpf_sx, SA_XLOCKED) /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (e.g., 802.11 * frames, ethernet frames, etc).
*/ CK_LIST_HEAD(bpf_iflist, bpf_if); static struct bpf_iflist bpf_iflist; static struct sx bpf_sx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpfif_ref(struct bpf_if *); static void bpfif_rele(struct bpf_if *); static void bpfd_ref(struct bpf_d *); static void bpfd_rele(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_detachd_locked(struct bpf_d *, bool); static void bpfd_free(epoch_context_t); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0; #define V_bpf_optimize_writers VNET(bpf_optimize_writers) SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0, "Do not send packets until BPF program is set"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; static struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, }; /* * LOCKING MODEL USED BY BPF * * Locks: * 1) global lock (BPF_LOCK). An sx lock, used to protect some global * counters, all bpf_iflist changes, and to serialize ioctl access to bpf * descriptors. * 2) Descriptor lock. Mutex, used to protect BPF buffers and various * structure fields used by bpf_*tap* code. * * Lock order: global lock, then descriptor lock. * * There are several possible consumers: * * 1. The kernel registers an interface pointer with bpfattach(). * Each call allocates a new bpf_if structure, references the ifnet pointer * and links the bpf_if into the bpf_iflist chain. This is protected by the * global lock. * * 2. A userland application uses ioctl() calls on a bpf_d descriptor. * All such calls are serialized by the global lock. BPF filters can be - * changed, but pointer to old filter will be freed using epoch_call(). + * changed, but the pointer to the old filter will be freed using NET_EPOCH_CALL().
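 * A minimal sketch of that deferred-free pattern, using a hypothetical
 * struct obj purely for illustration:
 *
 *	static void
 *	obj_free(epoch_context_t ctx)
 *	{
 *		struct obj *o = __containerof(ctx, struct obj, epoch_ctx);
 *
 *		free(o, M_BPF);
 *	}
 *	...
 *	NET_EPOCH_CALL(obj_free, &o->epoch_ctx);
 *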
* Thus it should be safe for bpf_tap/bpf_mtap* code to access the * filter pointers, even if a change happens during bpf_tap execution. - * Destroying of bpf_d descriptor also is doing using epoch_call(). + * Destroying of a bpf_d descriptor is also done using NET_EPOCH_CALL(). * * 3. A userland application can write packets into a bpf_d descriptor. * Here we need to be sure that the ifnet won't disappear during bpfwrite(). * * 4. The kernel invokes bpf_tap/bpf_mtap* functions. Access to * bif_dlist is protected by the net_epoch_preempt section, so it should * be safe to access a bpf_d descriptor inside the section. * * 5. The kernel invokes bpfdetach() on interface destruction. All lists * are modified with the global lock held and the actual free() is done using - * epoch_call(). + * NET_EPOCH_CALL(). */ static void bpfif_free(epoch_context_t ctx) { struct bpf_if *bp; bp = __containerof(ctx, struct bpf_if, epoch_ctx); if_rele(bp->bif_ifp); free(bp, M_BPF); } static void bpfif_ref(struct bpf_if *bp) { refcount_acquire(&bp->bif_refcnt); } static void bpfif_rele(struct bpf_if *bp) { if (!refcount_release(&bp->bif_refcnt)) return; - epoch_call(net_epoch_preempt, &bp->epoch_ctx, bpfif_free); + NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx); } static void bpfd_ref(struct bpf_d *d) { refcount_acquire(&d->bd_refcnt); } static void bpfd_rele(struct bpf_d *d) { if (!refcount_release(&d->bd_refcnt)) return; - epoch_call(net_epoch_preempt, &d->epoch_ctx, bpfd_free); + NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx); } static struct bpf_program_buffer* bpf_program_buffer_alloc(size_t size, int flags) { return (malloc(sizeof(struct bpf_program_buffer) + size, M_BPF, flags)); } static void bpf_program_buffer_free(epoch_context_t ctx) { struct bpf_program_buffer *ptr; ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx); #ifdef BPF_JITTER if (ptr->func != NULL) bpf_destroy_jit_filter(ptr->func); #endif free(ptr, M_BPF); } /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, etc. */ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot.
*/ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. */ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. */ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_d *d) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. 
*/ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len < hlen || len - hlen > ifp->if_mtu) return (EMSGSIZE); m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR); if (m == NULL) return (EIO); m->m_pkthdr.len = m->m_len = len; *mp = m; error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } if (d->bd_hdrcmplt == 0) { memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(mtod(m, const void *), sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach descriptor to the bpf interface, i.e. make d listen on bp, * then reset its buffers and counters with reset_d(). */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { int op_w; BPF_LOCK_ASSERT(); /* * Save sysctl value to protect from sysctl change * between reads */ op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) bpf_detachd_locked(d, false); /* * Point d at bp, and add d to the interface's list. * Since there are many applications using BPF for * sending raw packets only (dhcpd and cdpd are good examples), * we can delay adding d to the list of active listeners until * some filter is configured. */ BPFD_LOCK(d); /* * Hold a reference to the bpf_if while the descriptor uses this * interface. */ bpfif_ref(bp); d->bd_bif = bp; if (op_w != 0) { /* Add to writers-only list */ CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * The first BIOCSETF is done by pcap_open_live() to set up * the snap length. After that the application usually sets * its own filter. */ d->bd_writer = 2; } else CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); reset_d(d); BPFD_UNLOCK(d); bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", __func__, d->bd_pid, d->bd_writer ? "writer" : "active"); if (op_w == 0) EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Check if we need to upgrade our descriptor @d from write-only mode. */ static int bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) { int is_snap, need_upgrade; /* * Check if we've already upgraded or the new filter is empty.
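 * For reference, the snap-length program that pcap_open_live() is
 * expected to install first is a single return instruction of the form
 * (illustrative):
 *
 *	struct bpf_insn snap = BPF_STMT(BPF_RET | BPF_K, snaplen);
 *
 * which is what the BPF_RET | BPF_K test below matches.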
*/ if (d->bd_writer == 0 || fcode == NULL) return (0); need_upgrade = 0; /* * Check if cmd looks like snaplen setting from * pcap_bpf.c:pcap_open_live(). * Note we're not checking .k value here: * while pcap_open_live() definitely sets it to a non-zero value, * we'd prefer to treat the k=0 (deny ALL) case the same way: i.e. * do not consider upgrading immediately */ if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; if (is_snap == 0) { /* * We're setting the first filter, and it doesn't look like * a snaplen setting. We're probably using bpf directly. * Upgrade immediately. */ need_upgrade = 1; } else { /* * Do not require an upgrade on the first BIOCSETF * (used by pcap_open_live() to set the snaplen). */ if (--d->bd_writer == 0) { /* * The first snaplen filter has already * been set; this is probably a catch-all * filter */ need_upgrade = 1; } } CTR5(KTR_NET, "%s: filter function set by pid %d, " "bd_writer counter %d, snap %d upgrade %d", __func__, d->bd_pid, d->bd_writer, is_snap, need_upgrade); return (need_upgrade); } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); bpf_detachd_locked(d, false); BPF_UNLOCK(); } static void bpf_detachd_locked(struct bpf_d *d, bool detached_ifp) { struct bpf_if *bp; struct ifnet *ifp; int error; BPF_LOCK_ASSERT(); CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; BPFD_LOCK(d); /* Remove d from the interface's descriptor list. */ CK_LIST_REMOVE(d, bd_next); /* Save bd_writer value */ error = d->bd_writer; ifp = bp->bif_ifp; d->bd_bif = NULL; if (detached_ifp) { /* * Notify descriptor as it's detached, so that any * sleepers wake up and get ENXIO. */ bpf_wakeup(d); } BPFD_UNLOCK(d); bpf_bpfd_cnt--; /* Call event handler iff d is attached */ if (error == 0) EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. * If so, and if the ifnet is not detached, turn it off. */ if (d->bd_promisc && !detached_ifp) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged. * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } bpfif_rele(bp); } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); bpf_detachd(d); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpfd_rele(d); } /* * Open the bpf device. Each open() gets a private descriptor via * devfs_set_cdevpriv(), so concurrent opens do not conflict.
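 * A typical userland open sequence looks like this (a sketch only; the
 * buffer handling is up to the application):
 *
 *	void *buf;
 *	u_int blen = 0;
 *	int fd = open("/dev/bpf", O_RDWR);
 *
 *	if (fd != -1 && ioctl(fd, BIOCGBLEN, &blen) == 0)
 *		buf = malloc(blen);	/* bpfread() requires this size */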
*/ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* Set up counters */ d->bd_rcount = counter_u64_alloc(M_WAITOK); d->bd_dcount = counter_u64_alloc(M_WAITOK); d->bd_fcount = counter_u64_alloc(M_WAITOK); d->bd_wcount = counter_u64_alloc(M_WAITOK); d->bd_wfcount = counter_u64_alloc(M_WAITOK); d->bd_wdcount = counter_u64_alloc(M_WAITOK); d->bd_zcopy = counter_u64_alloc(M_WAITOK); /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); if ((flags & FREAD) == 0) d->bd_writer = 2; d->bd_hbuf_in_use = 0; d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_refcnt = 1; BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int error; int non_block; int timed_out; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict the application to use a buffer the same size as * the kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; while (d->bd_hbuf_in_use) { error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET|PCATCH, "bd_hbuf", 0); if (error != 0) { BPFD_UNLOCK(d); return (error); } } /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if (d->bd_slen != 0) { /* * One or more packets arrived since the previous * read, or arrived while we were asleep. */ if (d->bd_immediate || non_block || timed_out) { /* * Rotate the buffers and return what's here * if we are in immediate mode, non-blocking * flag is set, or this descriptor timed out. */ ROTATE_BUFFERS(d); break; } } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (non_block) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_lock, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot.
*/ d->bd_hbuf_in_use = 1; BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. * * We do not have to worry about simultaneous reads because * we waited for sole access to the hold buffer above. */ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf")); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); d->bd_hbuf_in_use = 0; wakeup(&d->bd_hbuf_in_use); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK_ASSERT(d); if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } } static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct route ro; struct sockaddr dst; struct epoch_tracker et; struct bpf_if *bp; struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); NET_EPOCH_ENTER(et); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); counter_u64_add(d->bd_wcount, 1); if ((bp = d->bd_bif) == NULL) { error = ENXIO; goto out_locked; } ifp = bp->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { error = ENETDOWN; goto out_locked; } if (uio->uio_resid == 0) goto out_locked; bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; /* * Take extra reference, unlock d and exit from epoch section, * since bpf_movein() can sleep. */ bpfd_ref(d); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); error = bpf_movein(uio, (int)bp->bif_dlt, ifp, &m, &dst, &hlen, d); if (error != 0) { counter_u64_add(d->bd_wdcount, 1); bpfd_rele(d); return (error); } BPFD_LOCK(d); /* * Check that descriptor is still attached to the interface. * This can happen on bpfdetach(). To avoid access to detached * ifnet, free mbuf and return ENXIO. */ if (d->bd_bif == NULL) { counter_u64_add(d->bd_wdcount, 1); BPFD_UNLOCK(d); bpfd_rele(d); m_freem(m); return (ENXIO); } counter_u64_add(d->bd_wfcount, 1); if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, M_NOWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ CURVNET_SET(ifp->if_vnet); #ifdef MAC mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); #endif bzero(&ro, sizeof(ro)); if (hlen != 0) { ro.ro_prepend = (u_char *)&dst.sa_data; ro.ro_plen = hlen; ro.ro_flags = RT_HAS_HEADER; } /* Avoid possible recursion on BPFD_LOCK(). 
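 * if_output() may feed the packet straight back into bpf_mtap() on the
 * same interface, which would try to take the descriptor lock we are
 * still holding, so drop it before transmitting.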
*/ NET_EPOCH_ENTER(et); BPFD_UNLOCK(d); error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) counter_u64_add(d->bd_wdcount, 1); if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); bpfd_rele(d); return (error); out_locked: counter_u64_add(d->bd_wdcount, 1); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the receive * and drop counts. This is doable for kernel-only buffers, but with * zero-copy buffers, we can't write to (or rotate) buffers that are * currently owned by userspace. It would be nice if we could encapsulate * this logic in the buffer code rather than here. */ static void reset_d(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } if (bpf_canwritebuf(d)) d->bd_slen = 0; counter_u64_zero(d->bd_rcount); counter_u64_zero(d->bd_dcount); counter_u64_zero(d->bd_fcount); counter_u64_zero(d->bd_wcount); counter_u64_zero(d->bd_wfcount); counter_u64_zero(d->bd_wdcount); counter_u64_zero(d->bd_zcopy); } /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCGTSTAMP Get time stamp format and resolution. * BIOCSTSTAMP Set time stamp format and resolution. * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. * BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. * BIOCGETBUFMODE Get current buffer mode. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. 
*/ BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: #endif case BIOCGETIF: case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCSTSTAMP: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } #ifdef COMPAT_FREEBSD32 /* * If we see a 32-bit compat ioctl, mark the stream as 32-bit so * that it will get 32-bit packet headers. */ switch (cmd) { case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { BPFD_LOCK(d); d->bd_compat32 = 1; BPFD_UNLOCK(d); } } #endif CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufsize; BPFD_UNLOCK(d); break; /* * Set buffer length. */ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: #ifdef COMPAT_FREEBSD32 case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: #endif error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } break; /* * Get current data link type. */ case BIOCGDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; BPF_UNLOCK(); break; /* * Get a list of supported data link types. */ #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: { struct bpf_dltlist32 *list32; struct bpf_dltlist dltlist; list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { error = bpf_getdltlist(d, &dltlist); if (error == 0) list32->bfl_len = dltlist.bfl_len; } BPF_UNLOCK(); break; } #endif case BIOCGDLTLIST: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); BPF_UNLOCK(); break; /* * Set data link type. */ case BIOCSDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); BPF_UNLOCK(); break; /* * Get interface name. */ case BIOCGETIF: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } BPF_UNLOCK(); break; /* * Set interface. 
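 * Typical userland usage, with the interface name as an example only:
 *
 *	struct ifreq ifr;
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);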
*/ case BIOCSETIF: { int alloc_buf, size; /* * Behavior here depends on the buffering model. If * we're using kernel memory buffers, then we can * allocate them here. If we're using zero-copy, * then the user process must have registered buffers * by the time we get here. */ alloc_buf = 0; BPFD_LOCK(d); if (d->bd_bufmode == BPF_BUFMODE_BUFFER && d->bd_sbuf == NULL) alloc_buf = 1; BPFD_UNLOCK(d); if (alloc_buf) { size = d->bd_bufsize; error = bpf_buffer_ioctl_sblen(d, &size); if (error != 0) break; } BPF_LOCK(); error = bpf_setif(d, (struct ifreq *)addr); BPF_UNLOCK(); break; } /* * Set read timeout. */ case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; #if defined(COMPAT_FREEBSD32) && !defined(__mips__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCSRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv = &tv64; tv->tv_sec = tv32->tv_sec; tv->tv_usec = tv32->tv_usec; } else #endif tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. */ case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif { struct timeval *tv; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCGRTIMEOUT32) tv = &tv64; else #endif tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; } #endif break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount); bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount); break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: BPFD_LOCK(d); d->bd_immediate = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: BPFD_LOCK(d); *(u_int *)addr = d->bd_hdrcmplt; BPFD_UNLOCK(d); break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: BPFD_LOCK(d); d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; BPFD_UNLOCK(d); break; /* * Get packet direction flag */ case BIOCGDIRECTION: BPFD_LOCK(d); *(u_int *)addr = d->bd_direction; BPFD_UNLOCK(d); break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: BPFD_LOCK(d); d->bd_direction = direction; BPFD_UNLOCK(d); break; default: error = EINVAL; } } break; /* * Get packet timestamp format and resolution. */ case BIOCGTSTAMP: BPFD_LOCK(d); *(u_int *)addr = d->bd_tstamp; BPFD_UNLOCK(d); break; /* * Set packet timestamp format and resolution. 
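 * For example, to request monotonic nanosecond timestamps (sketch):
 *
 *	u_int t = BPF_T_NANOTIME | BPF_T_MONOTONIC;
 *
 *	ioctl(fd, BIOCSTSTAMP, &t);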
*/ case BIOCSTSTAMP: { u_int func; func = *(u_int *)addr; if (BPF_T_VALID(func)) d->bd_tstamp = func; else error = EINVAL; } break; case BIOCFEEDBACK: BPFD_LOCK(d); d->bd_feedback = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCLOCK: BPFD_LOCK(d); d->bd_locked = 1; BPFD_UNLOCK(d); break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ BPFD_LOCK(d); d->bd_async = *(int *)addr; BPFD_UNLOCK(d); break; case FIOSETOWN: /* * XXX: Add some sort of locking here? * fsetown() can sleep. */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: BPFD_LOCK(d); *(int *)addr = fgetown(&d->bd_sigio); BPFD_UNLOCK(d); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else { BPFD_LOCK(d); d->bd_sig = sig; BPFD_UNLOCK(d); } break; } case BIOCGRSIG: BPFD_LOCK(d); *(u_int *)addr = d->bd_sig; BPFD_UNLOCK(d); break; case BIOCGETBUFMODE: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufmode; BPFD_UNLOCK(d); break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. */ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: CURVNET_RESTORE(); return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); CURVNET_RESTORE(); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: error = bpf_ioctl_getzmax(td, d, (size_t *)addr); break; case BIOCSETZBUF: error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCROTZBUF: error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); break; } CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. * * Note we use global lock here to serialize bpf_setf() and bpf_setif() * calls. */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { #ifdef COMPAT_FREEBSD32 struct bpf_program fp_swab; struct bpf_program32 *fp32; #endif struct bpf_program_buffer *fcode; struct bpf_insn *filter; #ifdef BPF_JITTER bpf_jit_filter *jfunc; #endif size_t size; u_int flen; bool track_event; #ifdef COMPAT_FREEBSD32 switch (cmd) { case BIOCSETF32: case BIOCSETWF32: case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; switch (cmd) { case BIOCSETF32: cmd = BIOCSETF; break; case BIOCSETWF32: cmd = BIOCSETWF; break; } break; } #endif filter = NULL; #ifdef BPF_JITTER jfunc = NULL; #endif /* * Check new filter validness before acquiring any locks. * Allocate memory for new filter, if needed. */ flen = fp->bf_len; if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0)) return (EINVAL); size = flen * sizeof(*fp->bf_insns); if (size > 0) { /* We're setting up new filter. Copy and check actual data. 
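 * bpf_validate() is what stands between the copied-in program and the
 * kernel; it is expected to reject programs with out-of-range jumps,
 * references outside the scratch memory, or division by a constant
 * zero.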
*/ fcode = bpf_program_buffer_alloc(size, M_WAITOK); filter = (struct bpf_insn *)fcode->buffer; if (copyin(fp->bf_insns, filter, size) != 0 || !bpf_validate(filter, flen)) { free(fcode, M_BPF); return (EINVAL); } #ifdef BPF_JITTER if (cmd != BIOCSETWF) { /* * Filter is copied inside fcode and is * perfectly valid. */ jfunc = bpf_jitter(filter, flen); } #endif } track_event = false; fcode = NULL; BPF_LOCK(); BPFD_LOCK(d); /* Set up new filter. */ if (cmd == BIOCSETWF) { if (d->bd_wfilter != NULL) { fcode = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER fcode->func = NULL; #endif } d->bd_wfilter = filter; } else { if (d->bd_rfilter != NULL) { fcode = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER fcode->func = d->bd_bfilter; #endif } d->bd_rfilter = filter; #ifdef BPF_JITTER d->bd_bfilter = jfunc; #endif if (cmd == BIOCSETF) reset_d(d); if (bpf_check_upgrade(cmd, d, filter, flen) != 0) { /* * Filter can be set several times without * specifying interface. In this case just mark d * as reader. */ d->bd_writer = 0; if (d->bd_bif != NULL) { /* * Remove descriptor from writers-only list * and add it to active readers list. */ CK_LIST_REMOVE(d, bd_next); CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist, d, bd_next); CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); track_event = true; } } } BPFD_UNLOCK(d); if (fcode != NULL) - epoch_call(net_epoch_preempt, &fcode->epoch_ctx, - bpf_program_buffer_free); + NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx); if (track_event) EVENTHANDLER_INVOKE(bpf_track, d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1); BPF_UNLOCK(); return (0); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; BPF_LOCK_ASSERT(); theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* * At this point, we expect the buffer is already allocated. If not, * return an error. */ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); break; default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } if (bp != d->bd_bif) bpf_attachd(d, bp); else { BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); } return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. 
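 * A userland consumer registers interest roughly as follows (sketch):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);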
*/ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d; if (devfs_get_cdevpriv((void **)&d) != 0 || kn->kn_filter != EVFILT_READ) return (1); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; /* * Ignore the hold buffer if it is being copied to user space. */ if (!d->bd_hbuf_in_use && d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } #define BPF_TSTAMP_NONE 0 #define BPF_TSTAMP_FAST 1 #define BPF_TSTAMP_NORMAL 2 #define BPF_TSTAMP_EXTERN 3 static int bpf_ts_quality(int tstype) { if (tstype == BPF_T_NONE) return (BPF_TSTAMP_NONE); if ((tstype & BPF_T_FAST) != 0) return (BPF_TSTAMP_FAST); return (BPF_TSTAMP_NORMAL); } static int bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m) { struct m_tag *tag; int quality; quality = bpf_ts_quality(tstype); if (quality == BPF_TSTAMP_NONE) return (quality); if (m != NULL) { tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL); if (tag != NULL) { *bt = *(struct bintime *)(tag + 1); return (BPF_TSTAMP_EXTERN); } } if (quality == BPF_TSTAMP_NORMAL) binuptime(bt); else getbinuptime(bt); return (quality); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int slen; int gottime; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { counter_u64_add(d->bd_rcount, 1); /* * NB: We don't call BPF_CHECK_DIRECTION() here since there * is no way for the caller to indicate to us whether this * packet is inbound or outbound. In the bpf_mtap() routines, * we use the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; if (bf != NULL) slen = (*(bf->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { /* * Filter matches. Acquire the descriptor lock. */ BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when the packet is in an mbuf chain. * The locking model is explained in bpf_tap().
*/ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. 
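 * (Passing a buflen of 0 to bpf_filter() below is what tells it to treat
 * its first argument as an mbuf chain rather than a flat buffer.)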
*/ mb.m_flags = 0; mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } #undef BPF_CHECK_DIRECTION #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL #undef BPF_TSTAMP_EXTERN static int bpf_hdrlen(struct bpf_d *d) { int hdrlen; hdrlen = d->bd_bif->bif_hdrlen; #ifndef BURN_BRIDGES if (d->bd_tstamp == BPF_T_NONE || BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME) #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen = BPF_WORDALIGN32(hdrlen); else #endif hdrlen = BPF_WORDALIGN(hdrlen); return (hdrlen - d->bd_bif->bif_hdrlen); } static void bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype) { struct bintime bt2, boottimebin; struct timeval tsm; struct timespec tsn; if ((tstype & BPF_T_MONOTONIC) == 0) { bt2 = *bt; getboottimebin(&boottimebin); bintime_add(&bt2, &boottimebin); bt = &bt2; } switch (BPF_T_FORMAT(tstype)) { case BPF_T_MICROTIME: bintime2timeval(bt, &tsm); ts->bt_sec = tsm.tv_sec; ts->bt_frac = tsm.tv_usec; break; case BPF_T_NANOTIME: bintime2timespec(bt, &tsn); ts->bt_sec = tsn.tv_sec; ts->bt_frac = tsn.tv_nsec; break; case BPF_T_BINTIME: ts->bt_sec = bt->sec; ts->bt_frac = bt->frac; break; } } /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *bt) { struct bpf_xhdr hdr; #ifndef BURN_BRIDGES struct bpf_hdr hdr_old; #ifdef COMPAT_FREEBSD32 struct bpf_hdr32 hdr32_old; #endif #endif int caplen, curlen, hdrlen, totlen; int do_wakeup = 0; int do_timestamp; int tstype; BPFD_LOCK_ASSERT(d); if (d->bd_bif == NULL) { /* Descriptor was detached in concurrent thread */ counter_u64_add(d->bd_dcount, 1); return; } /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). 
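 * For example, with a 1514-byte packet and a snaplen of 96 the copy is
 * hdrlen + 96 bytes; with a snaplen of 65535 it is hdrlen + 1514,
 * clamped to the buffer size in either case.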
*/ hdrlen = bpf_hdrlen(d); totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. * * Drop the packet if there's no room and no hope of room. If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wake up pending processes. */ #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. */ bpf_buffull(d); counter_u64_add(d->bd_dcount, 1); return; } KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use")); ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* * Immediate mode is set, or the read timeout has already * expired during a select call. A packet arrived, so the * reader should be woken up. */ do_wakeup = 1; caplen = totlen - hdrlen; tstype = d->bd_tstamp; do_timestamp = tstype != BPF_T_NONE; #ifndef BURN_BRIDGES if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) { struct bpf_ts ts; if (do_timestamp) bpf_bintime2ts(bt, &ts, tstype); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) { bzero(&hdr32_old, sizeof(hdr32_old)); if (do_timestamp) { hdr32_old.bh_tstamp.tv_sec = ts.bt_sec; hdr32_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr32_old.bh_datalen = pktlen; hdr32_old.bh_hdrlen = hdrlen; hdr32_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old, sizeof(hdr32_old)); goto copy; } #endif bzero(&hdr_old, sizeof(hdr_old)); if (do_timestamp) { hdr_old.bh_tstamp.tv_sec = ts.bt_sec; hdr_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr_old.bh_datalen = pktlen; hdr_old.bh_hdrlen = hdrlen; hdr_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old, sizeof(hdr_old)); goto copy; } #endif /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); if (do_timestamp) bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype); hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. */ #ifndef BURN_BRIDGES copy: #endif (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpfd_free(epoch_context_t ctx) { struct bpf_d *d; struct bpf_program_buffer *p; /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and hasn't yet been marked * free.
*/ d = __containerof(ctx, struct bpf_d, epoch_ctx); bpf_free(d); if (d->bd_rfilter != NULL) { p = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = d->bd_bfilter; #endif bpf_program_buffer_free(&p->epoch_ctx); } if (d->bd_wfilter != NULL) { p = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = NULL; #endif bpf_program_buffer_free(&p->epoch_ctx); } mtx_destroy(&d->bd_lock); counter_u64_free(d->bd_rcount); counter_u64_free(d->bd_dcount); counter_u64_free(d->bd_fcount); counter_u64_free(d->bd_wcount); counter_u64_free(d->bd_wfcount); counter_u64_free(d->bd_wdcount); counter_u64_free(d->bd_zcopy); free(d, M_BPF); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supported). */ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO); CK_LIST_INIT(&bp->bif_dlist); CK_LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; bp->bif_hdrlen = hdrlen; bp->bif_bpf = driverp; bp->bif_refcnt = 1; *driverp = bp; /* * Reference the ifnet pointer so it won't be freed until * we release it. */ if_ref(ifp); BPF_LOCK(); CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } #ifdef VIMAGE /* * When moving interfaces between vnet instances we need a way to * query the dlt and hdrlen before detach so we can re-attach the if_bpf * after the vmove. We unfortunately have no device driver infrastructure * to query the interface for these values after creation/attach, thus * add this as a workaround. */ int bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) { if (bp == NULL) return (ENXIO); if (bif_dlt == NULL && bif_hdrlen == NULL) return (0); if (bif_dlt != NULL) *bif_dlt = bp->bif_dlt; if (bif_hdrlen != NULL) *bif_hdrlen = bp->bif_hdrlen; return (0); } #endif /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface. Notify each descriptor as it's detached * so that any sleepers wake up and get ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; struct bpf_d *d; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; CK_LIST_REMOVE(bp, bif_next); *bp->bif_bpf = (struct bpf_if *)&dead_bpf_if; CTR4(KTR_NET, "%s: scheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); /* Detach common descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd_locked(d, true); } /* Detach writer-only descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) { bpf_detachd_locked(d, true); } bpfif_rele(bp); } BPF_UNLOCK(); } /* * Get a list of available data link types of the interface.
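 * Userland normally issues this ioctl twice (sketch):
 *
 *	struct bpf_dltlist bfl = { 0, NULL };
 *
 *	ioctl(fd, BIOCGDLTLIST, &bfl);	/* learn the count */
 *	bfl.bfl_list = calloc(bfl.bfl_len, sizeof(u_int));
 *	ioctl(fd, BIOCGDLTLIST, &bfl);	/* fetch the list */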
*/ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { struct ifnet *ifp; struct bpf_if *bp; u_int *lst; int error, n, n1; BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; n1 = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp) n1++; } if (bfl->bfl_list == NULL) { bfl->bfl_len = n1; return (0); } if (n1 > bfl->bfl_len) return (ENOMEM); lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; lst[n++] = bp->bif_dlt; } error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); free(lst, M_TEMP); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; BPF_LOCK_ASSERT(); MPASS(d->bd_bif != NULL); /* * It is safe to check bd_bif without BPFD_LOCK; it cannot be * changed while we hold the global lock. */ if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp == NULL) return (EINVAL); opromisc = d->bd_promisc; bpf_attachd(d, bp); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n", __func__, error); else d->bd_promisc = 1; } return (0); } static void bpf_drvinit(void *unused) { struct cdev *dev; sx_init(&bpf_sx, "bpf global lock"); CK_LIST_INIT(&bpf_iflist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); } /* * Zero out the various packet counters associated with all of the bpf * descriptors. At some point, we will probably want to get a bit more * granular and allow the user to specify descriptors to be zeroed. */ static void bpf_zero_counters(void) { struct bpf_if *bp; struct bpf_d *bd; BPF_LOCK(); /* * We are protected by the global lock here; interfaces and * descriptors cannot be deleted while we hold it.
*/ CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { counter_u64_zero(bd->bd_rcount); counter_u64_zero(bd->bd_dcount); counter_u64_zero(bd->bd_fcount); counter_u64_zero(bd->bd_wcount); counter_u64_zero(bd->bd_wfcount); counter_u64_zero(bd->bd_zcopy); } } BPF_UNLOCK(); } /* * Fill filter statistics */ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { BPF_LOCK_ASSERT(); bzero(d, sizeof(*d)); d->bd_structsize = sizeof(*d); d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = counter_u64_fetch(bd->bd_rcount); d->bd_dcount = counter_u64_fetch(bd->bd_dcount); d->bd_fcount = counter_u64_fetch(bd->bd_fcount); d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = counter_u64_fetch(bd->bd_wcount); d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount); d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount); d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy); d->bd_bufmode = bd->bd_bufmode; } /* * Handle `netstat -B' stats request */ static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { static const struct xbpf_d zerostats; struct xbpf_d *xbdbuf, *xbd, tempstats; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. */ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); /* * Check to see if the user is requesting that the counters be * zeroed out. Explicitly check that the supplied data is zeroed, * as we aren't allowing the user to set the counters currently. */ if (req->newptr != NULL) { if (req->newlen != sizeof(tempstats)) return (EINVAL); memset(&tempstats, 0, sizeof(tempstats)); error = SYSCTL_IN(req, &tempstats, sizeof(tempstats)); if (error) return (error); if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0) return (EINVAL); bpf_zero_counters(); return (0); } if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); BPF_LOCK(); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { BPF_UNLOCK(); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { /* Send writers-only first */ CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. 
*/ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = (struct bpf_if *)&dead_bpf_if; } void bpfdetach(struct ifnet *ifp) { } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return -1; /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return 0; /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ #ifdef DDB static void bpf_show_bpf_if(struct bpf_if *bpf_if) { if (bpf_if == NULL) return; db_printf("%p:\n", bpf_if); #define BPF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, bpf_if->e); /* bif_ext.bif_next */ /* bif_ext.bif_dlist */ BPF_DB_PRINTF("%#x", bif_dlt); BPF_DB_PRINTF("%u", bif_hdrlen); /* bif_wlist */ BPF_DB_PRINTF("%p", bif_ifp); BPF_DB_PRINTF("%p", bif_bpf); BPF_DB_PRINTF("%u", bif_refcnt); } DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) { if (!have_addr) { db_printf("usage: show bpf_if \n"); return; } bpf_show_bpf_if((struct bpf_if *)addr); } #endif Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 356754) +++ head/sys/net/if.c (revision 356755) @@ -1,4556 +1,4556 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_bpf.h" #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ _IOC_NEWTYPE((cmd), struct ifgroupreq32): case #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, 
OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void *if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); static void if_siocaddmulti(void *, int); #ifdef VIMAGE static int if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid including many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); VNET_DEFINE_STATIC(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stabilize the list, but both * must be acquired writable to modify the list.
This model allows us * both to stabilize the interface list during interrupt thread processing * and to stabilize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so we will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; if (__predict_false(idx > V_if_index)) return (NULL); ifp = *(struct ifnet * const volatile *)(V_ifindex_table + idx); return (__predict_false(ifp == IFNET_HOLD) ? NULL : ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; NET_EPOCH_ASSERT(); ifp = ifnet_byindex(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) return (NULL); if_ref(ifp); return (ifp); } /* * Allocate an ifindex array entry; return the allocated index, or * USHRT_MAX on failure, in which case *old holds a stale table that * the caller must free after an epoch wait. */ static u_short ifindex_alloc(void **old) { u_short idx; IFNET_WLOCK_ASSERT(); /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { *old = if_grow(); return (USHRT_MAX); } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { V_ifindex_table[idx] = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; NET_EPOCH_ASSERT(); ifp = ifnet_byindex(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { void *old; CK_STAILQ_INIT(&V_ifnet); CK_STAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); old = if_grow(); /* create initial table */ IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets.
*/ CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) if_vmove(ifp, ifp->if_home_vnet); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void * if_grow(void) { int oldlim; u_int n; struct ifnet **e; void *old; old = NULL; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return (NULL); } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); old = V_ifindex_table; } V_if_indexlim <<= 1; V_ifindex_table = e; return (old); } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; void *old; KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); if (numa_domain == IF_NODOM) ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK | M_ZERO); else ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); restart: IFNET_WLOCK(); idx = ifindex_alloc(&old); if (__predict_false(idx == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; ifp->if_numa_domain = numa_domain; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); CK_STAILQ_INIT(&ifp->if_addrhead); CK_STAILQ_INIT(&ifp->if_multiaddrs); CK_STAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } struct ifnet * if_alloc_dev(u_char type, device_t dev) { int numa_domain; if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) return (if_alloc_domain(type, IF_NODOM)); return (if_alloc_domain(type, numa_domain)); } struct ifnet * if_alloc(u_char type) { return (if_alloc_domain(type, IF_NODOM)); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. 
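 *
 * This is not called directly; the final reference drop defers the
 * free past any concurrent epoch(9) readers, following the pattern
 * used by if_rele() below:
 *
 *	if (refcount_release(&ifp->if_refcount))
 *		NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx);
 *
 * and if_destroy() then runs if_free_internal() from the epoch
 * callback context.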
*/ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp->if_description, M_IFDESCR); free(ifp->if_hw_addr, M_IFADDR); if (ifp->if_numa_domain == IF_NODOM) free(ifp, M_IFNET); else free_domain(ifp, M_IFNET); } static void if_destroy(epoch_context_t ctx) { struct ifnet *ifp; ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); if_free_internal(ifp); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) - epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); + NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; - epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); + NET_EPOCH_CALL(if_destroy, &ifp->if_epoch_ctx); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. 
* * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if there was no change; else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possible name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized.
*/ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults need to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = CK_STAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_epochalloc(void *dummy __unused) { net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT); } SYSINIT(ifepochalloc, SI_SUB_EPOCH, SI_ORDER_ANY, if_epochalloc, NULL); static void if_attachdomain(void *dummy) { struct ifnet *ifp; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization entirely. */ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa; while (1) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } NET_EPOCH_EXIT(et); if (ifa == NULL) break; #ifdef INET /* XXX: Ugly!!
ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destroyed as the first thing during teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE bool shutdown; shutdown = ifp->if_vnet->vnet_shutdown; #endif IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); if (!vmove) ifp->if_flags |= IFF_DYING; found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. */ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface was still on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif epoch_wait_preempt(net_epoch_preempt); /* * Ensure all pending EPOCH(9) callbacks have been executed. This * fixes issues with late destruction of multicast options * which lead to leave-group calls, which in turn access the * associated ifnet structure: */ epoch_drain_callbacks(net_epoch_preempt); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change?
*/ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); taskqueue_drain(taskqueue_swi, &ifp->if_addmultitask); /* * Check if this is a cloned interface or not. Must be done even if * shutting down, as an if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() would otherwise get a corrupted string * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* Give interface users the chance to clean up. */ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to clean up * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark about it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Clean up all addresses. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in the current * vnet and if_attach()es the ifnet to the vnet specified as the 2nd arg. * An attempt is made to shrink if_index in the current vnet, find an * unused if_index in the target vnet, calling if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static int if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; #ifdef DEV_BPF u_int bif_dlt, bif_hdrlen; #endif void *old; int rc; #ifdef DEV_BPF /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * save the dlt and hdrlen so we can re-attach it later.
*/ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); #endif /* * Detach from the current vnet, but preserve LLADDR info and do not * mark it dead, etc., so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return (rc); /* * Unlink the ifnet from ifindex_table[] in the current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); restart: IFNET_WLOCK(); ifp->if_index = ifindex_alloc(&old); if (__predict_false(ifp->if_index == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); #ifdef DEV_BPF if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); #endif CURVNET_RESTORE(); return (0); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int error; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exist in the destination prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ if (ifp->if_vnet->vnet_shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ error = if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to userland on success. */ if (error == 0) sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (error); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int error; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ if (ifp->if_vnet->vnet_shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet.
*/ error = if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland on success. */ if (error == 0) sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (error); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; CK_STAILQ_INIT(&ifg->ifg_members); CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Helper function to remove a group out of an interface. Expects the global * ifnet lock to be write-locked, and drops it before returning. 
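 *
 * The expected calling pattern, as used by if_delgroup() and
 * if_delgroups() below (sketch):
 *
 *	IFNET_WLOCK();
 *	(look up the ifg_list entry "ifgl" for the group)
 *	_if_delgroup_locked(ifp, ifgl, groupname);	(drops IFNET_WLOCK)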
*/ static void _if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl, const char *groupname) { struct ifg_member *ifgm; bool freeifgl; IFNET_WLOCK_ASSERT(); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) { if (ifgm->ifgm_ifp == ifp) { CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); break; } } if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); freeifgl = true; } else { freeifgl = false; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } free(ifgm, M_TEMP); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } _if_delgroup_locked(ifp, ifgl, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while ((ifgl = CK_STAILQ_FIRST(&ifp->if_groups)) != NULL) { strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); _if_delgroup_locked(ifp, ifgl, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. 
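 *
 * Like the other variable-length requests here, userland typically
 * issues SIOCGIFGROUP twice; an illustrative sketch (error handling
 * omitted; "s" is any socket and the interface name is hypothetical):
 *
 *	struct ifgroupreq ifgr = { .ifgr_len = 0 };
 *	strlcpy(ifgr.ifgr_name, "em0", sizeof(ifgr.ifgr_name));
 *	ioctl(s, SIOCGIFGROUP, &ifgr);		(reports the needed length)
 *	ifgr.ifgr_groups = calloc(1, ifgr.ifgr_len);
 *	ioctl(s, SIOCGIFGROUP, &ifgr);		(copies out the ifg_req array)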
*/ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; NET_EPOCH_ASSERT(); if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) return (EINVAL); bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) return (error); len -= sizeof(ifgrq); ifgp++; } return (0); } /* * Stores all members of a group in memory pointed to by ifgr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but the function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data.
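 *
 * This is the snapshot that reaches userland via the routing socket
 * and sysctl export paths, e.g. what getifaddrs(3) consumers see in
 * ifa_data for AF_LINK entries (assuming the usual rtsock export path).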
*/ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } static void ifa_destroy(epoch_context_t ctx) { struct ifaddr *ifa; ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx); counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) - epoch_call(net_epoch_preempt, &ifa->ifa_epoch_ctx, ifa_destroy); + NET_EPOCH_CALL(ifa_destroy, &ifa->ifa_epoch_ctx); } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { struct epoch_tracker et; int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; struct ifaddr *rti_ifa = NULL; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; if (cmd == RTM_ADD) { /* explicitly specify (loopback) ifa */ if (info.rti_ifp != NULL) { NET_EPOCH_ENTER(et); rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp); if (rti_ifa != NULL) ifa_ref(rti_ifa); info.rti_ifa = rti_ifa; NET_EPOCH_EXIT(et); } } info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, 
ifp->if_fib); if (rti_ifa != NULL) ifa_free(rti_ifa); if (error != 0 && !(cmd == RTM_ADD && error == EEXIST) && !(cmd == RTM_DELETE && error == ENOENT)) if_printf(ifp, "%s failed: %d\n", otype, error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { struct epoch_tracker et; int rc; NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); NET_EPOCH_EXIT(et); return (rc); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Locate the point-to-point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Find an interface on a specific network. If there are many, the * choice is the most specific one found.
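 *
 * For example (hypothetical addresses), with 192.0.2.1/24 configured
 * on one interface and 192.0.2.65/26 on another, a lookup for
 * 192.0.2.70 matches both prefixes and the /26 entry wins, because
 * its netmask refines the /24 one.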
*/ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; NET_EPOCH_ASSERT(); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan through each interface, looking for ones that have addresses * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit disagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of the new prefix is better than that of * the old one, then remember the new one * before continuing to search for an even * better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { ifa_maybe = ifa; } } } } ifa = ifa_maybe; ifa_maybe = NULL; done: return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } /* * See whether the new ifa is better than the current one: * 1) A non-virtual one is preferred over a virtual one. * 2) A virtual one in master state is preferred over any other state. * * Used in several address selecting functions.
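 *
 * The return convention: non-zero means "next" should replace "cur",
 * so a non-CARP address displaces a CARP one, and a CARP address in
 * MASTER state displaces a CARP address that is not.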
*/ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_pcp_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; /* XXXGL: reference ifp? */ taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp; int link_state; ifp = arg; link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? 
"UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } NET_EPOCH_EXIT(et); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. 
*/ int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever work? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to include the * terminating nul. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver-owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if the permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) if_printf(ifp, "permanently promiscuous mode %s\n", ((new_flags & IFF_PPROMISC) ? 
"enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (strcmp(new_name, ifp->if_xname) == 0) break; if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if_printf(ifp, "changing name to '%s'\n", new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET DEBUGNET_NOTIFY_MTU(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. 
*/ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct epoch_tracker et; struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): { struct epoch_tracker et; NET_EPOCH_ENTER(et); error = if_getgroup((struct ifgroupreq *)data, ifp); NET_EPOCH_EXIT(et); break; } case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; 
ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data = NULL; struct ifmediareq ifmr; struct ifmediareq *ifmrp = NULL; #endif struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ if (so->so_vnet->vnet_shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); goto out_noref; #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); if (error == 0) ifc32->ifc_len = ifc.ifc_len; goto out_noref; } #endif } #ifdef COMPAT_FREEBSD32 switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. 
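 *
 * Illustrative sketch (not part of the original source): typical usage is
 * through a wrapper such as ifpromisc() below, and "on" and "off" calls
 * must be paired:
 *
 *	error = ifpromisc(ifp, 1);	-- first caller sets IFF_PROMISC
 *	...
 *	error = ifpromisc(ifp, 0);	-- last caller clears it
 *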
* * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save the ifnet parameters, since if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we are not the only user, in which case touching the * refcount is enough. Actually toggle the interface flag only if * we are the first or the last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) if_printf(ifp, "promiscuous mode %s\n", (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return the interface configuration of the system. The list may be * used in later ioctls (above) to get other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { struct epoch_tracker et; int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack.
*/ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. 
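 *
 * Editor's note: freeing is deferred through the network epoch (see
 * if_destroymulti() below), so readers traversing the multicast list
 * under NET_EPOCH_ENTER() never see the memory disappear under them.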
*/ #ifdef MCAST_VERBOSE extern void kdb_backtrace(void); #endif static void if_freemulti_internal(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); #ifdef MCAST_VERBOSE kdb_backtrace(); printf("%s freeing ifma: %p\n", __func__, ifma); #endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } static void if_destroymulti(epoch_context_t ctx) { struct ifmultiaddr *ifma; ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx); if_freemulti_internal(ifma); } void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount)); - epoch_call(net_epoch_preempt, &ifma->ifma_epoch_ctx, if_destroymulti); + NET_EPOCH_CALL(if_destroymulti, &ifma->ifma_epoch_ctx); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions return address data which * fits inside the default sockaddr_dl structure. However, the * callback can allocate another sockaddr structure, in which case * we need to free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate it as well. * When this block finishes, the link layer address will be on the * list.
*/ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } ll_ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { if (THREAD_CAN_SLEEP()) (void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); else taskqueue_enqueue(taskqueue_swi, &ifp->if_addmultitask); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } static void if_siocaddmulti(void *arg, int pending) { struct ifnet *ifp; ifp = arg; #ifdef DIAGNOSTIC if (pending > 1) if_printf(ifp, "%d SIOCADDMULTI coalesced\n", pending); #endif CURVNET_SET(ifp->if_vnet); (void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); CURVNET_RESTORE(); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; KASSERT(ifp, ("%s: NULL ifp", __func__)); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. 
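 *
 * Illustrative sketch (not part of the original source): a protocol
 * typically stores the ifma returned by if_addmulti() and releases it
 * later:
 *
 *	struct ifmultiaddr *ifma;
 *
 *	error = if_addmulti(ifp, sa, &ifma);
 *	...
 *	if_delmulti_ifma(ifma);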
*/ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct epoch_tracker et; struct ifnet *oifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; NET_EPOCH_EXIT(et); if (ifp != oifp) ifp = NULL; } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } } if_freemulti(ll_ifma); } } #ifdef INVARIANTS if (ifp) { struct ifmultiaddr *ifmatmp; CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) MPASS(ifma != ifmatmp); } #endif if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. 
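 *
 * Illustrative sketch (not part of the original source): for an Ethernet
 * interface the new address must be exactly sdl_alen (6) bytes, e.g. a
 * hypothetical locally administered MAC:
 *
 *	static const u_char mac[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *
 *	error = if_setlladdr(ifp, mac, sizeof(mac));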
* * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; ifa = ifp->if_addr; if (ifa == NULL) return (EINVAL); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) return (EINVAL); if (len != sdl->sdl_alen) /* don't allow length to change */ return (EINVAL); switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); } /* * Compat function for handling basic encapsulation requests. * Stacks that have not been converted (FDDI, IB, ...) support the * traditional output model: ARP (and other similar L2 protocols) is * handled inside the output routine, and arpresolve()/nd6_resolve() * return a MAC address instead of a full prepend. * * This function creates the calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Tunnel interfaces can nest, and they may cause infinite recursion * when misconfigured. We'll prevent this by detecting loops. * A high nesting level may cause stack exhaustion. We'll prevent this * by introducing an upper limit. * * Return 0 if the tunnel nesting count is less than or equal to the * limit. */ int if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie, int limit) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); return (EIO); } count++; } if (count > limit) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", if_name(ifp), count); return (EIO); } mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does.
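 * (A hypothetical attach-time call, as an editor's illustration:
 *
 *	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 *
 * where "dev" is the driver's device_t.)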
For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char *fmt, ...) { char if_fmt[256]; va_list ap; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); va_start(ap, fmt); vlog(LOG_INFO, if_fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. 
Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } /* * Methods for drivers to access interface unicast and multicast * link level addresses. Driver shall not know 'struct ifaddr' neither * 'struct ifmultiaddr'. 
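 *
 * Illustrative sketch (not part of the original source; the mydrv_* names
 * are hypothetical): a driver reprogramming its hardware multicast filter
 * might use if_foreach_llmaddr() like this:
 *
 *	static u_int
 *	mydrv_hash_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
 *	{
 *		struct mydrv_softc *sc = arg;
 *
 *		mydrv_set_hash_bit(sc, LLADDR(sdl));
 *		return (1);		-- count this address
 *	}
 *
 *	...
 *	mcnt = if_foreach_llmaddr(ifp, mydrv_hash_maddr, sc);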
*/ u_int if_lladdr_count(if_t ifp) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_LINK) count++; NET_EPOCH_EXIT(et); return (count); } u_int if_foreach_lladdr(if_t ifp, iflladdr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifaddr *ifa; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; count += (*cb)(cb_arg, (struct sockaddr_dl *)ifa->ifa_addr, count); } NET_EPOCH_EXIT(et); return (count); } u_int if_llmaddr_count(if_t ifp) { struct epoch_tracker et; struct ifmultiaddr *ifma; int count; count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) if (ifma->ifma_addr->sa_family == AF_LINK) count++; NET_EPOCH_EXIT(et); return (count); } u_int if_foreach_llmaddr(if_t ifp, iflladdr_cb_t cb, void *cb_arg) { struct epoch_tracker et; struct ifmultiaddr *ifma; u_int count; MPASS(cb); count = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count += (*cb)(cb_arg, (struct sockaddr_dl *)ifma->ifma_addr, count); } NET_EPOCH_EXIT(et); return (count); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = 
if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: head/sys/net/if_gre.c =================================================================== --- head/sys/net/if_gre.c (revision 356754) +++ head/sys/net/if_gre.c (revision 356755) @@ -1,820 +1,820 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998 The NetBSD Foundation, Inc. * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * IPv6-over-GRE contributed by Gert Doering * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #include #ifdef RSS #include #endif #endif #ifdef INET6 #include #include #include #ifdef RSS #include #endif #endif #include #include #include #include #include #include #define GREMTU 1476 static const char grename[] = "gre"; MALLOC_DEFINE(M_GRE, grename, "Generic Routing Encapsulation"); static struct sx gre_ioctl_sx; SX_SYSINIT(gre_ioctl_sx, &gre_ioctl_sx, "gre_ioctl"); static int gre_clone_create(struct if_clone *, int, caddr_t); static void gre_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, gre_cloner); #define V_gre_cloner VNET(gre_cloner) static void gre_qflush(struct ifnet *); static int gre_transmit(struct ifnet *, struct mbuf *); static int gre_ioctl(struct ifnet *, u_long, caddr_t); static int gre_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void gre_delete_tunnel(struct gre_softc *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW, 0, "Generic Routing Encapsulation"); #ifndef MAX_GRE_NEST /* * This macro controls the default upper limitation on nesting of gre tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gre tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. 
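 *
 * For example (editor's note), a nesting depth of 3 could be allowed
 * with "options MAX_GRE_NEST=3" in the kernel configuration file, or
 * at run time through the sysctl declared below:
 *
 *	sysctl net.link.gre.max_nesting=3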
*/ #define MAX_GRE_NEST 1 #endif VNET_DEFINE_STATIC(int, max_gre_nesting) = MAX_GRE_NEST; #define V_max_gre_nesting VNET(max_gre_nesting) SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_gre_nesting), 0, "Max nested tunnels"); static void vnet_gre_init(const void *unused __unused) { V_gre_cloner = if_clone_simple(grename, gre_clone_create, gre_clone_destroy, 0); #ifdef INET in_gre_init(); #endif #ifdef INET6 in6_gre_init(); #endif } VNET_SYSINIT(vnet_gre_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_init, NULL); static void vnet_gre_uninit(const void *unused __unused) { if_clone_detach(V_gre_cloner); #ifdef INET in_gre_uninit(); #endif #ifdef INET6 in6_gre_uninit(); #endif /* XXX: epoch_call drain */ } VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_uninit, NULL); static int gre_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gre_softc *sc; sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK | M_ZERO); sc->gre_fibnum = curthread->td_proc->p_fibnum; GRE2IFP(sc) = if_alloc(IFT_TUNNEL); GRE2IFP(sc)->if_softc = sc; if_initname(GRE2IFP(sc), grename, unit); GRE2IFP(sc)->if_mtu = GREMTU; GRE2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; GRE2IFP(sc)->if_output = gre_output; GRE2IFP(sc)->if_ioctl = gre_ioctl; GRE2IFP(sc)->if_transmit = gre_transmit; GRE2IFP(sc)->if_qflush = gre_qflush; GRE2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; GRE2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(GRE2IFP(sc)); bpfattach(GRE2IFP(sc), DLT_NULL, sizeof(u_int32_t)); return (0); } static void gre_clone_destroy(struct ifnet *ifp) { struct gre_softc *sc; sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; gre_delete_tunnel(sc); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&gre_ioctl_sx); GRE_WAIT(); if_free(ifp); free(sc, M_GRE); } static int gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct gre_softc *sc; uint32_t opt; int error; switch (cmd) { case SIOCSIFMTU: /* XXX: */ if (ifr->ifr_mtu < 576) return (EINVAL); ifp->if_mtu = ifr->ifr_mtu; return (0); case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: return (0); case GRESADDRS: case GRESADDRD: case GREGADDRS: case GREGADDRD: case GRESPROTO: case GREGPROTO: return (EOPNOTSUPP); } sx_xlock(&gre_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto end; } error = 0; switch (cmd) { case SIOCDIFPHYADDR: if (sc->gre_family == 0) break; gre_delete_tunnel(sc); break; #ifdef INET case SIOCSIFPHYADDR: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: error = in_gre_ioctl(sc, cmd, data); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: error = in6_gre_ioctl(sc, cmd, data); break; #endif case SIOCGTUNFIB: ifr->ifr_fib = sc->gre_fibnum; break; case SIOCSTUNFIB: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if (ifr->ifr_fib >= rt_numfibs) error = EINVAL; else sc->gre_fibnum = ifr->ifr_fib; break; case GRESKEY: case GRESOPTS: case GRESPORT: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if ((error = copyin(ifr_data_get_ptr(ifr), &opt, sizeof(opt))) != 0) break; if (cmd == GRESKEY) { if (sc->gre_key == opt) break; } else if (cmd == GRESOPTS) { if (opt & ~GRE_OPTMASK) { error = EINVAL; break; } if (sc->gre_options == opt) break; } else if (cmd == GRESPORT) { if (opt != 0 && (opt < V_ipport_hifirstauto || opt > V_ipport_hilastauto)) { error = EINVAL; 
break; } if (sc->gre_port == opt) break; if ((sc->gre_options & GRE_UDPENCAP) == 0) { /* * UDP encapsulation is not enabled, thus * there is no need to reattach softc. */ sc->gre_port = opt; break; } } switch (sc->gre_family) { #ifdef INET case AF_INET: error = in_gre_setopts(sc, cmd, opt); break; #endif #ifdef INET6 case AF_INET6: error = in6_gre_setopts(sc, cmd, opt); break; #endif default: /* * Tunnel is not yet configured. * We can just change any parameters. */ if (cmd == GRESKEY) sc->gre_key = opt; if (cmd == GRESOPTS) sc->gre_options = opt; if (cmd == GRESPORT) sc->gre_port = opt; break; } /* * XXX: Do we need to initiate change of interface * state here? */ break; case GREGKEY: error = copyout(&sc->gre_key, ifr_data_get_ptr(ifr), sizeof(sc->gre_key)); break; case GREGOPTS: error = copyout(&sc->gre_options, ifr_data_get_ptr(ifr), sizeof(sc->gre_options)); break; case GREGPORT: error = copyout(&sc->gre_port, ifr_data_get_ptr(ifr), sizeof(sc->gre_port)); break; default: error = EINVAL; break; } if (error == 0 && sc->gre_family != 0) { if ( #ifdef INET cmd == SIOCSIFPHYADDR || #endif #ifdef INET6 cmd == SIOCSIFPHYADDR_IN6 || #endif 0) { if_link_state_change(ifp, LINK_STATE_UP); } } end: sx_xunlock(&gre_ioctl_sx); return (error); } static void gre_delete_tunnel(struct gre_softc *sc) { struct gre_socket *gs; sx_assert(&gre_ioctl_sx, SA_XLOCKED); if (sc->gre_family != 0) { CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); free(sc->gre_hdr, M_GRE); sc->gre_family = 0; } /* * If this Tunnel was the last one that could use UDP socket, * we should unlink socket from hash table and close it. */ if ((gs = sc->gre_so) != NULL && CK_LIST_EMPTY(&gs->list)) { CK_LIST_REMOVE(gs, chain); soclose(gs->so); - epoch_call(net_epoch_preempt, &gs->epoch_ctx, gre_sofree); + NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx); sc->gre_so = NULL; } GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GRE2IFP(sc), LINK_STATE_DOWN); } struct gre_list * gre_hashinit(void) { struct gre_list *hash; int i; hash = malloc(sizeof(struct gre_list) * GRE_HASH_SIZE, M_GRE, M_WAITOK); for (i = 0; i < GRE_HASH_SIZE; i++) CK_LIST_INIT(&hash[i]); return (hash); } void gre_hashdestroy(struct gre_list *hash) { free(hash, M_GRE); } void gre_sofree(epoch_context_t ctx) { struct gre_socket *gs; gs = __containerof(ctx, struct gre_socket, epoch_ctx); free(gs, M_GRE); } static __inline uint16_t gre_cksum_add(uint16_t sum, uint16_t a) { uint16_t res; res = sum + a; return (res + (res < a)); } void gre_update_udphdr(struct gre_softc *sc, struct udphdr *udp, uint16_t csum) { sx_assert(&gre_ioctl_sx, SA_XLOCKED); MPASS(sc->gre_options & GRE_UDPENCAP); udp->uh_dport = htons(GRE_UDPPORT); udp->uh_sport = htons(sc->gre_port); udp->uh_sum = csum; udp->uh_ulen = 0; } void gre_update_hdr(struct gre_softc *sc, struct grehdr *gh) { uint32_t *opts; uint16_t flags; sx_assert(&gre_ioctl_sx, SA_XLOCKED); flags = 0; opts = gh->gre_opts; if (sc->gre_options & GRE_ENABLE_CSUM) { flags |= GRE_FLAGS_CP; sc->gre_hlen += 2 * sizeof(uint16_t); *opts++ = 0; } if (sc->gre_key != 0) { flags |= GRE_FLAGS_KP; sc->gre_hlen += sizeof(uint32_t); *opts++ = htonl(sc->gre_key); } if (sc->gre_options & GRE_ENABLE_SEQ) { flags |= GRE_FLAGS_SP; sc->gre_hlen += sizeof(uint32_t); *opts++ = 0; } else sc->gre_oseq = 0; gh->gre_flags = htons(flags); } int gre_input(struct mbuf *m, int off, int proto, void *arg) { struct gre_softc *sc = arg; struct grehdr *gh; struct ifnet *ifp; uint32_t *opts; #ifdef notyet uint32_t key; #endif uint16_t flags; int 
hlen, isr, af; ifp = GRE2IFP(sc); hlen = off + sizeof(struct grehdr) + 4 * sizeof(uint32_t); if (m->m_pkthdr.len < hlen) goto drop; if (m->m_len < hlen) { m = m_pullup(m, hlen); if (m == NULL) goto drop; } gh = (struct grehdr *)mtodo(m, off); flags = ntohs(gh->gre_flags); if (flags & ~GRE_FLAGS_MASK) goto drop; opts = gh->gre_opts; hlen = 2 * sizeof(uint16_t); if (flags & GRE_FLAGS_CP) { /* reserved1 field must be zero */ if (((uint16_t *)opts)[1] != 0) goto drop; if (in_cksum_skip(m, m->m_pkthdr.len, off) != 0) goto drop; hlen += 2 * sizeof(uint16_t); opts++; } if (flags & GRE_FLAGS_KP) { #ifdef notyet /* * XXX: The current implementation uses the key only for outgoing * packets. But we can check the key value here, or even in the * encapcheck function. */ key = ntohl(*opts); #endif hlen += sizeof(uint32_t); opts++; } #ifdef notyet } else key = 0; if (sc->gre_key != 0 && (key != sc->gre_key || key != 0)) goto drop; #endif if (flags & GRE_FLAGS_SP) { #ifdef notyet seq = ntohl(*opts); #endif hlen += sizeof(uint32_t); } switch (ntohs(gh->gre_proto)) { case ETHERTYPE_WCCP: /* * For WCCP, skip an additional 4 bytes if the GRE header is not * followed by an IP header. */ if (flags == 0 && (*(uint8_t *)gh->gre_opts & 0xF0) != 0x40) hlen += sizeof(uint32_t); /* FALLTHROUGH */ case ETHERTYPE_IP: isr = NETISR_IP; af = AF_INET; break; case ETHERTYPE_IPV6: isr = NETISR_IPV6; af = AF_INET6; break; default: goto drop; } m_adj(m, off + hlen); m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; M_SETFIB(m, ifp->if_fib); #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if ((ifp->if_flags & IFF_MONITOR) != 0) m_freem(m); else netisr_dispatch(isr, m); return (IPPROTO_DONE); drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return (IPPROTO_DONE); } static int gre_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { uint32_t af; if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; /* * Now save the af in the inbound pkt csum data; this is a cheat, since * we are using the inbound csum_data field to carry the af over to * the gre_transmit() routine, avoiding using yet another mtag.
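[Editor's note] The option walk in gre_input() above steps past each optional field announced in the flag word. The same length accounting as a standalone helper; this is a sketch with local copies of the flag constants (net/if_gre.h defines the real ones, mirroring the RFC 2890 bit positions):

#include <stdint.h>

#define GRE_FLAGS_CP	0x8000	/* checksum present */
#define GRE_FLAGS_KP	0x2000	/* key present */
#define GRE_FLAGS_SP	0x1000	/* sequence number present */

/* Total GRE header bytes implied by a given flag word. */
static int
gre_hdr_len(uint16_t flags)
{
	int hlen = 2 * sizeof(uint16_t);	/* flags + protocol type */

	if (flags & GRE_FLAGS_CP)
		hlen += 2 * sizeof(uint16_t);	/* checksum + reserved1 */
	if (flags & GRE_FLAGS_KP)
		hlen += sizeof(uint32_t);	/* key */
	if (flags & GRE_FLAGS_SP)
		hlen += sizeof(uint32_t);	/* sequence number */
	return (hlen);
}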
*/ m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); } static void gre_setseqn(struct grehdr *gh, uint32_t seq) { uint32_t *opts; uint16_t flags; opts = gh->gre_opts; flags = ntohs(gh->gre_flags); KASSERT((flags & GRE_FLAGS_SP) != 0, ("gre_setseqn called, but GRE_FLAGS_SP isn't set ")); if (flags & GRE_FLAGS_CP) opts++; if (flags & GRE_FLAGS_KP) opts++; *opts = htonl(seq); } static uint32_t gre_flowid(struct gre_softc *sc, struct mbuf *m, uint32_t af) { uint32_t flowid; if ((sc->gre_options & GRE_UDPENCAP) == 0 || sc->gre_port != 0) return (0); #ifndef RSS switch (af) { #ifdef INET case AF_INET: flowid = mtod(m, struct ip *)->ip_src.s_addr ^ mtod(m, struct ip *)->ip_dst.s_addr; break; #endif #ifdef INET6 case AF_INET6: flowid = mtod(m, struct ip6_hdr *)->ip6_src.s6_addr32[3] ^ mtod(m, struct ip6_hdr *)->ip6_dst.s6_addr32[3]; break; #endif default: flowid = 0; } #else /* RSS */ switch (af) { #ifdef INET case AF_INET: flowid = rss_hash_ip4_2tuple(mtod(m, struct ip *)->ip_src, mtod(m, struct ip *)->ip_dst); break; #endif #ifdef INET6 case AF_INET6: flowid = rss_hash_ip6_2tuple( &mtod(m, struct ip6_hdr *)->ip6_src, &mtod(m, struct ip6_hdr *)->ip6_dst); break; #endif default: flowid = 0; } #endif return (flowid); } #define MTAG_GRE 1307983903 static int gre_transmit(struct ifnet *ifp, struct mbuf *m) { GRE_RLOCK_TRACKER; struct gre_softc *sc; struct grehdr *gh; struct udphdr *uh; uint32_t af, flowid; int error, len; uint16_t proto; len = 0; GRE_RLOCK(); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); goto drop; } #endif error = ENETDOWN; sc = ifp->if_softc; if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->gre_family == 0 || (error = if_tunnel_check_nesting(ifp, m, MTAG_GRE, V_max_gre_nesting)) != 0) { m_freem(m); goto drop; } af = m->m_pkthdr.csum_data; BPF_MTAP2(ifp, &af, sizeof(af), m); m->m_flags &= ~(M_BCAST|M_MCAST); flowid = gre_flowid(sc, m, af); M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, sc->gre_hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto drop; } bcopy(sc->gre_hdr, mtod(m, void *), sc->gre_hlen); /* Determine GRE proto */ switch (af) { #ifdef INET case AF_INET: proto = htons(ETHERTYPE_IP); break; #endif #ifdef INET6 case AF_INET6: proto = htons(ETHERTYPE_IPV6); break; #endif default: m_freem(m); error = ENETDOWN; goto drop; } /* Determine offset of GRE header */ switch (sc->gre_family) { #ifdef INET case AF_INET: len = sizeof(struct ip); break; #endif #ifdef INET6 case AF_INET6: len = sizeof(struct ip6_hdr); break; #endif default: m_freem(m); error = ENETDOWN; goto drop; } if (sc->gre_options & GRE_UDPENCAP) { uh = (struct udphdr *)mtodo(m, len); uh->uh_sport |= htons(V_ipport_hifirstauto) | (flowid >> 16) | (flowid & 0xFFFF); uh->uh_sport = htons(ntohs(uh->uh_sport) % V_ipport_hilastauto); uh->uh_ulen = htons(m->m_pkthdr.len - len); uh->uh_sum = gre_cksum_add(uh->uh_sum, htons(m->m_pkthdr.len - len + IPPROTO_UDP)); m->m_pkthdr.csum_flags = sc->gre_csumflags; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); len += sizeof(struct udphdr); } gh = (struct grehdr *)mtodo(m, len); gh->gre_proto = proto; if (sc->gre_options & GRE_ENABLE_SEQ) gre_setseqn(gh, sc->gre_oseq++); if (sc->gre_options & GRE_ENABLE_CSUM) { *(uint16_t *)gh->gre_opts = in_cksum_skip(m, m->m_pkthdr.len, len); } len = m->m_pkthdr.len - len; switch (sc->gre_family) { #ifdef INET case AF_INET: error = in_gre_output(m, af, sc->gre_hlen); break; #endif #ifdef INET6 case AF_INET6: error = 
in6_gre_output(m, af, sc->gre_hlen, flowid); break; #endif default: m_freem(m); error = ENETDOWN; } drop: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); } GRE_RUNLOCK(); return (error); } static void gre_qflush(struct ifnet *ifp __unused) { } static int gremodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gre_mod = { "if_gre", gremodevent, 0 }; DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gre, 1); Index: head/sys/net/if_lagg.c =================================================================== --- head/sys/net/if_lagg.c (revision 356754) +++ head/sys/net/if_lagg.c (revision 356755) @@ -1,2401 +1,2401 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * Copyright (c) 2014, 2016 Marcelo Araujo * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include #define LAGG_RLOCK() struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et) #define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &lagg_et) #define LAGG_RLOCK_ASSERT() NET_EPOCH_ASSERT() #define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch(net_epoch_preempt)) #define LAGG_SX_INIT(_sc) sx_init(&(_sc)->sc_sx, "if_lagg sx") #define LAGG_SX_DESTROY(_sc) sx_destroy(&(_sc)->sc_sx) #define LAGG_XLOCK(_sc) sx_xlock(&(_sc)->sc_sx) #define LAGG_XUNLOCK(_sc) sx_xunlock(&(_sc)->sc_sx) #define LAGG_SXLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_LOCKED) #define LAGG_XLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_XLOCKED) /* Special flags we should propagate to the lagg ports. 
*/ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; struct lagg_snd_tag { struct m_snd_tag com; struct m_snd_tag *tag; }; VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx); #define V_lagg_list_mtx VNET(lagg_list_mtx) #define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ "if_lagg list", NULL, MTX_DEF) #define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) #define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface"); static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); #if defined(KERN_TLS) || defined(RATELIMIT) static int lagg_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); static int lagg_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int lagg_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void lagg_snd_tag_free(struct m_snd_tag *); static void lagg_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); static int lagg_setcaps(struct lagg_port *, int cap); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); /* Simple round robin */ static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *); static void lagg_lb_detach(struct lagg_softc *); 
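[Editor's note] The LAGG_RLOCK()/LAGG_RUNLOCK() macros above are thin wrappers around the preemptible network epoch rather than a conventional lock; a reader section expands to roughly this (sketch only):

struct epoch_tracker et;

epoch_enter_preempt(net_epoch_preempt, &et);
/* Lockless CK_SLIST/CK_STAILQ traversal; writers unlink entries and
 * defer the free via NET_EPOCH_CALL() until all such readers drain. */
epoch_exit_preempt(net_epoch_preempt, &et);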
static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* Broadcast */ static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *); static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ static const struct lagg_proto { lagg_proto pr_num; void (*pr_attach)(struct lagg_softc *); void (*pr_detach)(struct lagg_softc *); int (*pr_start)(struct lagg_softc *, struct mbuf *); struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, struct mbuf *); int (*pr_addport)(struct lagg_port *); void (*pr_delport)(struct lagg_port *); void (*pr_linkstate)(struct lagg_port *); void (*pr_init)(struct lagg_softc *); void (*pr_stop)(struct lagg_softc *); void (*pr_lladdr)(struct lagg_softc *); void (*pr_request)(struct lagg_softc *, void *); void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { { .pr_num = LAGG_PROTO_NONE }, { .pr_num = LAGG_PROTO_ROUNDROBIN, .pr_attach = lagg_rr_attach, .pr_start = lagg_rr_start, .pr_input = lagg_rr_input, }, { .pr_num = LAGG_PROTO_FAILOVER, .pr_start = lagg_fail_start, .pr_input = lagg_fail_input, }, { .pr_num = LAGG_PROTO_LOADBALANCE, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, .pr_addport = lagg_lb_port_create, .pr_delport = lagg_lb_port_destroy, }, { .pr_num = LAGG_PROTO_LACP, .pr_attach = lagg_lacp_attach, .pr_detach = lagg_lacp_detach, .pr_start = lagg_lacp_start, .pr_input = lagg_lacp_input, .pr_addport = lacp_port_create, .pr_delport = lacp_port_destroy, .pr_linkstate = lacp_linkstate, .pr_init = lacp_init, .pr_stop = lacp_stop, .pr_lladdr = lagg_lacp_lladdr, .pr_request = lacp_req, .pr_portreq = lacp_portreq, }, { .pr_num = LAGG_PROTO_BROADCAST, .pr_start = lagg_bcast_start, .pr_input = lagg_bcast_input, }, }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); /* Allow input on any failover links */ VNET_DEFINE_STATIC(int, lagg_failover_rx_all); #define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); /* Default value for using flowid */ VNET_DEFINE_STATIC(int, def_use_flowid) = 0; #define V_def_use_flowid VNET(def_use_flowid) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); /* Default value for using numa */ VNET_DEFINE_STATIC(int, def_use_numa) = 1; #define V_def_use_numa VNET(def_use_numa) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN, &VNET_NAME(def_use_numa), 0, "Use numa to steer flows"); /* Default value for flowid shift */ VNET_DEFINE_STATIC(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) SYSCTL_INT(_net_link_lagg, OID_AUTO, 
default_flowid_shift, CTLFLAG_RWTUN, &VNET_NAME(def_flowid_shift), 0, "Default setting for flowid shift for load sharing"); static void vnet_lagg_init(const void *unused __unused) { LAGG_LIST_LOCK_INIT(); SLIST_INIT(&V_lagg_list); V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, lagg_clone_destroy, 0); } VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_init, NULL); static void vnet_lagg_uninit(const void *unused __unused) { if_clone_detach(V_lagg_cloner); LAGG_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_lagg_uninit, NULL); static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: lagg_input_p = lagg_input; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); lagg_input_p = NULL; lagg_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t lagg_mod = { "if_lagg", lagg_modevent, 0 }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); static void lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) { LAGG_XLOCK_ASSERT(sc); KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto", __func__, sc)); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "using proto %u\n", pr); if (lagg_protos[pr].pr_attach != NULL) lagg_protos[pr].pr_attach(sc); sc->sc_proto = pr; } static void lagg_proto_detach(struct lagg_softc *sc) { lagg_proto pr; LAGG_XLOCK_ASSERT(sc); pr = sc->sc_proto; sc->sc_proto = LAGG_PROTO_NONE; if (lagg_protos[pr].pr_detach != NULL) lagg_protos[pr].pr_detach(sc); } static int lagg_proto_start(struct lagg_softc *sc, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_start(sc, m)); } static struct mbuf * lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m)); } static int lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_addport == NULL) return (0); else return (lagg_protos[sc->sc_proto].pr_addport(lp)); } static void lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_delport != NULL) lagg_protos[sc->sc_proto].pr_delport(lp); } static void lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_linkstate != NULL) lagg_protos[sc->sc_proto].pr_linkstate(lp); } static void lagg_proto_init(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_init != NULL) lagg_protos[sc->sc_proto].pr_init(sc); } static void lagg_proto_stop(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_stop != NULL) lagg_protos[sc->sc_proto].pr_stop(sc); } static void lagg_proto_lladdr(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_lladdr != NULL) lagg_protos[sc->sc_proto].pr_lladdr(sc); } static void lagg_proto_request(struct lagg_softc *sc, void *v) { if (lagg_protos[sc->sc_proto].pr_request != NULL) lagg_protos[sc->sc_proto].pr_request(sc, v); } static void lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v) { if (lagg_protos[sc->sc_proto].pr_portreq != NULL) lagg_protos[sc->sc_proto].pr_portreq(lp, v); } /* * This routine is run via a vlan * config EVENT */ static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct
lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); LAGG_RUNLOCK(); } /* * This routine is run via a vlan * unconfig EVENT */ static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); LAGG_RUNLOCK(); } static int lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct lagg_softc *sc; struct ifnet *ifp; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_LAGG); return (ENOSPC); } LAGG_SX_INIT(sc); LAGG_XLOCK(sc); if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; if (V_def_use_numa) sc->sc_opts |= LAGG_OPT_USE_NUMA; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4; lagg_proto_attach(sc, LAGG_PROTO_DEFAULT); CK_SLIST_INIT(&sc->sc_ports); /* Initialise pseudo media types */ ifmedia_init(&sc->sc_media, 0, lagg_media_change, lagg_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); if_initname(ifp, laggname, unit); ifp->if_softc = sc; ifp->if_transmit = lagg_transmit; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = lagg_snd_tag_alloc; ifp->if_snd_tag_modify = lagg_snd_tag_modify; ifp->if_snd_tag_query = lagg_snd_tag_query; ifp->if_snd_tag_free = lagg_snd_tag_free; ifp->if_ratelimit_query = lagg_ratelimit_query; #endif ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device; children will be attached * as special device IFT_IEEE8023ADLAG.
*/ ether_ifattach(ifp, eaddr); sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ LAGG_LIST_LOCK(); SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); LAGG_LIST_UNLOCK(); LAGG_XUNLOCK(sc); return (0); } static void lagg_clone_destroy(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; LAGG_XLOCK(sc); sc->sc_destroying = 1; lagg_stop(sc); ifp->if_flags &= ~IFF_UP; EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); /* Shutdown and remove lagg ports */ while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ lagg_proto_detach(sc); LAGG_XUNLOCK(sc); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); if_free(ifp); LAGG_LIST_LOCK(); SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); LAGG_LIST_UNLOCK(); LAGG_SX_DESTROY(sc); free(sc, M_LAGG); } static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; int cap, ena, pena; uint64_t hwa; struct ifnet_hw_tsomax hw_tsomax; LAGG_XLOCK_ASSERT(sc); /* Get common enabled capabilities for the lagg ports */ ena = ~0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ena &= lp->lp_ifp->if_capenable; ena = (ena == ~0 ? 0 : ena); /* * Apply common enabled capabilities back to the lagg ports. * May require several iterations if they are dependent. */ do { pena = ena; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setcaps(lp, ena); ena &= lp->lp_ifp->if_capenable; } } while (pena != ena); /* Get other capabilities from the lagg ports */ cap = ~0; hwa = ~(uint64_t)0; memset(&hw_tsomax, 0, sizeof(hw_tsomax)); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; hwa &= lp->lp_ifp->if_hwassist; if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax); } cap = (cap == ~0 ? 0 : cap); hwa = (hwa == ~(uint64_t)0 ? 0 : hwa); if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_hwassist != hwa || if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_hwassist = hwa; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "capabilities 0x%08x enabled 0x%08x\n", cap, ena); } } static int lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp, *tlp; struct ifreq ifr; int error, i, oldmtu; uint64_t *pval; LAGG_XLOCK_ASSERT(sc); if (sc->sc_ifp == ifp) { if_printf(sc->sc_ifp, "cannot add a lagg to itself as a port\n"); return (EINVAL); } /* Limit the maximal number of lagg ports */ if (sc->sc_count >= LAGG_MAX_PORTS) return (ENOSPC); /* Check if port has already been associated to a lagg */ if (ifp->if_lagg != NULL) { /* Port is already in the current lagg? 
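[Editor's note] The do/while in lagg_capabilities() above is a fixed-point iteration: pushing the intersected mask back down with lagg_setcaps() may cause a driver to clear further dependent bits (offloads often depend on one another), so the intersection is recomputed until the mask stops changing. In isolation, with nports and ports[] as stand-in names (sketch, not the committed code):

int ena = ~0, pena;

do {
	pena = ena;
	for (int i = 0; i < nports; i++) {
		lagg_setcaps(ports[i], ena);	/* driver may clear more bits */
		ena &= ports[i]->lp_ifp->if_capenable;
	}
} while (pena != ena);			/* stop once the mask is stable */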
*/ lp = (struct lagg_port *)ifp->if_lagg; if (lp->lp_softc == sc) return (EEXIST); return (EBUSY); } /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return (EPROTONOSUPPORT); /* Allow the first Ethernet member to define the MTU */ oldmtu = -1; if (CK_SLIST_EMPTY(&sc->sc_ports)) { sc->sc_ifp->if_mtu = ifp->if_mtu; } else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { if (ifp->if_ioctl == NULL) { if_printf(sc->sc_ifp, "cannot change MTU for %s\n", ifp->if_xname); return (EINVAL); } oldmtu = ifp->if_mtu; strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_mtu = sc->sc_ifp->if_mtu; error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); if (error != 0) { if_printf(sc->sc_ifp, "invalid MTU for %s\n", ifp->if_xname); return (error); } ifr.ifr_mtu = oldmtu; } lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO); lp->lp_softc = sc; /* Check if port is a stacked lagg */ LAGG_LIST_LOCK(); SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (EINVAL); /* XXX disable stacking for the moment, its untested */ #ifdef LAGG_PORT_STACKING lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (E2BIG); } #endif } } LAGG_LIST_UNLOCK(); if_ref(ifp); lp->lp_ifp = ifp; bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); lp->lp_ifcapenable = ifp->if_capenable; if (CK_SLIST_EMPTY(&sc->sc_ports)) { bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } else { if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); } lagg_setflags(lp, 1); if (CK_SLIST_EMPTY(&sc->sc_ports)) sc->sc_primary = lp; /* Change the interface type */ lp->lp_iftype = ifp->if_type; ifp->if_type = IFT_IEEE8023ADLAG; ifp->if_lagg = lp; lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; lp->lp_output = ifp->if_output; ifp->if_output = lagg_port_output; /* Read port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) *pval = ifp->if_get_counter(ifp, i); /* * Insert into the list of ports. * Keep ports sorted by if_index. It is handy, when configuration * is predictable and `ifconfig laggN create ...` command * will lead to the same result each time. */ CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( CK_SLIST_NEXT(tlp, lp_entries) == NULL || ((struct lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index > ifp->if_index)) break; } if (tlp != NULL) CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries); else CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); sc->sc_count++; lagg_setmulti(lp); if ((error = lagg_proto_addport(sc, lp)) != 0) { /* Remove the port, without calling pr_delport. 
*/ lagg_port_destroy(lp, 0); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (error); } /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *sc) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int m = 0; LAGG_SXLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_flags & LAGG_PORT_STACK) { sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; m = MAX(m, lagg_port_checkstacking(sc_ptr)); } } return (m + 1); } #endif static void lagg_port_destroy_cb(epoch_context_t ec) { struct lagg_port *lp; struct ifnet *ifp; lp = __containerof(ec, struct lagg_port, lp_epoch_ctx); ifp = lp->lp_ifp; if_rele(ifp); free(lp, M_LAGG); } static int lagg_port_destroy(struct lagg_port *lp, int rundelport) { struct lagg_softc *sc = lp->lp_softc; struct lagg_port *lp_ptr, *lp0; struct ifnet *ifp = lp->lp_ifp; uint64_t *pval, vdiff; int i; LAGG_XLOCK_ASSERT(sc); if (rundelport) lagg_proto_delport(sc, lp); if (lp->lp_detaching == 0) lagg_clrmulti(lp); /* Restore interface */ ifp->if_type = lp->lp_iftype; ifp->if_ioctl = lp->lp_ioctl; ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; /* Update detached port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) { vdiff = ifp->if_get_counter(ifp, i) - *pval; sc->detached_counters.val[i] += vdiff; } /* Finally, remove the port from the lagg */ CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; /* Update the primary interface */ if (lp == sc->sc_primary) { uint8_t lladdr[ETHER_ADDR_LEN]; if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL) bzero(&lladdr, ETHER_ADDR_LEN); else bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN); sc->sc_primary = lp0; if (sc->sc_destroying == 0) { bcopy(lladdr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } /* * Update lladdr for each port (new primary needs update * as well, to switch from old lladdr to its 'real' one) */ CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) if_setlladdr(lp_ptr->lp_ifp, lladdr, ETHER_ADDR_LEN); } if (lp->lp_ifflags) if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); if (lp->lp_detaching == 0) { lagg_setflags(lp, 0); lagg_setcaps(lp, lp->lp_ifcapenable); if_setlladdr(ifp, lp->lp_lladdr, ETHER_ADDR_LEN); } /* * Free the port and release its ifnet reference after a grace period has * elapsed. */
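[Editor's note] The hunk just below is the point of this commit: the open-coded epoch_call(net_epoch_preempt, ...) becomes NET_EPOCH_CALL(), matching the conversion in gre_delete_tunnel() earlier. The underlying deferred-free pattern, reduced to a skeleton (a sketch, not the committed code; struct obj and M_TEMP are placeholders):

struct obj {
	struct epoch_context	epoch_ctx;
	/* ... */
};

static void
obj_free(epoch_context_t ctx)
{
	struct obj *o = __containerof(ctx, struct obj, epoch_ctx);

	free(o, M_TEMP);
}

/* Writer side, after unlinking 'o' from every reader-visible list: */
NET_EPOCH_CALL(obj_free, &o->epoch_ctx);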
- epoch_call(net_epoch_preempt, &lp->lp_epoch_ctx, lagg_port_destroy_cb); + NET_EPOCH_CALL(lagg_port_destroy_cb, &lp->lp_epoch_ctx); /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } static int lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_reqport *rp = (struct lagg_reqport *)data; struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; /* Should be checked by the caller */ if (ifp->if_type != IFT_IEEE8023ADLAG || (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) goto fallback; switch (cmd) { case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || ifunit(rp->rp_portname) != ifp) { error = EINVAL; break; } LAGG_RLOCK(); if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(); break; case SIOCSIFCAP: if (lp->lp_ioctl == NULL) { error = EINVAL; break; } error = (*lp->lp_ioctl)(ifp, cmd, data); if (error) break; /* Update lagg interface capabilities */ LAGG_XLOCK(sc); lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); break; case SIOCSIFMTU: /* Do not allow the MTU to be changed once joined */ error = EINVAL; break; default: goto fallback; } return (error); fallback: if (lp != NULL && lp->lp_ioctl != NULL) return ((*lp->lp_ioctl)(ifp, cmd, data)); return (EINVAL); } /* * Requests counter @cnt data. * * Counter value is calculated the following way: * 1) for each port, sum the difference between current and "initial" measurements. * 2) add lagg logical interface counters. * 3) add data from detached_counters array. * * We also do the following things on port attach/detach: * 1) On port attach we store all counters it has into port_counter array. * 2) On port detach we add the difference between "initial" and * current counters data to detached_counters array. */ static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt) { struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lpifp; uint64_t newval, oldval, vsum; /* Revise this when we've got non-generic counters. */ KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); sc = (struct lagg_softc *)ifp->if_softc; vsum = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* Saved attached value */ oldval = lp->port_counters.val[cnt]; /* current value */ lpifp = lp->lp_ifp; newval = lpifp->if_get_counter(lpifp, cnt); /* Calculate diff and save new */ vsum += newval - oldval; } LAGG_RUNLOCK(); /* * Add counter data which might be added by upper * layer protocols operating on logical interface. */ vsum += if_get_counter_default(ifp, cnt); /* * Add counter data from detached ports counters */ vsum += sc->detached_counters.val[cnt]; return (vsum); } /* * For direct output to child ports. */ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: return ((*lp->lp_output)(ifp, m, dst, ro)); } /* drop any other frames */ m_freem(m); return (ENETDOWN); } static void lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) { struct lagg_port *lp; struct lagg_softc *sc; if ((lp = ifp->if_lagg) == NULL) return;
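[Editor's note] A worked instance of the scheme documented above lagg_get_counter(): a port whose if_get_counter() read 1000 input packets at attach time and reads 1500 now contributes 500; members that left earlier already folded their deltas into detached_counters at detach time. Schematically (illustrative numbers only):

uint64_t vsum = 0;

vsum += 1500 - 1000;			  /* 1) attached port: delta since attach */
vsum += if_get_counter_default(ifp, cnt); /* 2) the lagg ifnet's own counters */
vsum += sc->detached_counters.val[cnt];	  /* 3) deltas from departed ports */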
/* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; sc = lp->lp_softc; LAGG_XLOCK(sc); lp->lp_detaching = 1; lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); } static void lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) { struct lagg_softc *sc = lp->lp_softc; strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; lagg_proto_portreq(sc, lp, &rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: if (lp == sc->sc_primary) rp->rp_flags |= LAGG_PORT_MASTER; if (lp == lagg_link_active(sc, sc->sc_primary)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_LACP: /* LACP has a different definition of active */ if (lacp_isactive(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; if (lacp_iscollecting(lp)) rp->rp_flags |= LAGG_PORT_COLLECTING; if (lacp_isdistributing(lp)) rp->rp_flags |= LAGG_PORT_DISTRIBUTING; break; } } static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; struct lagg_port *lp; LAGG_XLOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { LAGG_XUNLOCK(sc); return; } ifp->if_drv_flags |= IFF_DRV_RUNNING; /* * Update the port lladdrs if needed. * This might be a notification from if_setlladdr() * that the lladdr has changed. */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp), ETHER_ADDR_LEN) != 0) if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN); } lagg_proto_init(sc); LAGG_XUNLOCK(sc); } static void lagg_stop(struct lagg_softc *sc) { struct ifnet *ifp = sc->sc_ifp; LAGG_XLOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; lagg_proto_stop(sc); } static int lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0; bzero(&rpbuf, sizeof(rpbuf)); switch (cmd) { case SIOCGLAGG: LAGG_XLOCK(sc); buflen = sc->sc_count * sizeof(struct lagg_reqport); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); ra->ra_proto = sc->sc_proto; lagg_proto_request(sc, &ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (len < sizeof(rpbuf)) break; lagg_port2req(lp, &rpbuf); memcpy(buf, &rpbuf, sizeof(rpbuf)); count++; buf += sizeof(rpbuf); len -= sizeof(rpbuf); } LAGG_XUNLOCK(sc); ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); free(outbuf, M_TEMP); break; case SIOCSLAGG: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ra->ra_proto >= LAGG_PROTO_MAX) { error = EPROTONOSUPPORT; break; } LAGG_XLOCK(sc); lagg_proto_detach(sc); LAGG_UNLOCK_ASSERT(); lagg_proto_attach(sc, ra->ra_proto); LAGG_XUNLOCK(sc); break; case SIOCGLAGGOPTS: LAGG_XLOCK(sc); ro->ro_opts = sc->sc_opts; if (sc->sc_proto ==
LAGG_PROTO_LACP) { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; if (lsc->lsc_debug.lsc_tx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_TXTEST; if (lsc->lsc_debug.lsc_rx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; if (lsc->lsc_fast_timeout != 0) ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT; ro->ro_active = sc->sc_active; } else { ro->ro_active = 0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } ro->ro_bkt = sc->sc_stride; ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; LAGG_XUNLOCK(sc); break; case SIOCSLAGGOPTS: error = priv_check(td, PRIV_NET_LAGG); if (error) break; /* * The stride option was added without defining a corresponding * LAGG_OPT flag, so handle a non-zero value before checking * anything else to preserve compatibility. */ LAGG_XLOCK(sc); if (ro->ro_opts == 0 && ro->ro_bkt != 0) { if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) { LAGG_XUNLOCK(sc); error = EINVAL; break; } sc->sc_stride = ro->ro_bkt; } if (ro->ro_opts == 0) { LAGG_XUNLOCK(sc); break; } /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. */ int valid, lacp; switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_USE_NUMA: case -LAGG_OPT_USE_NUMA: case LAGG_OPT_FLOWIDSHIFT: case LAGG_OPT_RR_LIMIT: valid = 1; lacp = 0; break; case LAGG_OPT_LACP_TXTEST: case -LAGG_OPT_LACP_TXTEST: case LAGG_OPT_LACP_RXTEST: case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: case LAGG_OPT_LACP_TIMEOUT: case -LAGG_OPT_LACP_TIMEOUT: valid = lacp = 1; break; default: valid = lacp = 0; break; } if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. */ error = EINVAL; LAGG_XUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } /* * Store new options into sc->sc_opts except for * FLOWIDSHIFT, RR and LACP options. 
*/ if (lacp == 0) { if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) sc->flowid_shift = ro->ro_flowid_shift; else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) { if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN || ro->ro_bkt == 0) { error = EINVAL; LAGG_XUNLOCK(sc); break; } sc->sc_stride = ro->ro_bkt; } else if (ro->ro_opts > 0) sc->sc_opts |= ro->ro_opts; else sc->sc_opts &= ~ro->ro_opts; } else { struct lacp_softc *lsc; struct lacp_port *lp; lsc = (struct lacp_softc *)sc->sc_psc; switch (ro->ro_opts) { case LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 1; break; case -LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 0; break; case LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 1; break; case -LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 0; break; case LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 1; break; case -LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 0; break; case LAGG_OPT_LACP_TIMEOUT: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state |= LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 1; break; case -LAGG_OPT_LACP_TIMEOUT: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state &= ~LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 0; break; } } LAGG_XUNLOCK(sc); break; case SIOCGLAGGFLAGS: rf->rf_flags = 0; LAGG_XLOCK(sc); if (sc->sc_flags & MBUF_HASHFLAG_L2) rf->rf_flags |= LAGG_F_HASHL2; if (sc->sc_flags & MBUF_HASHFLAG_L3) rf->rf_flags |= LAGG_F_HASHL3; if (sc->sc_flags & MBUF_HASHFLAG_L4) rf->rf_flags |= LAGG_F_HASHL4; LAGG_XUNLOCK(sc); break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { error = EINVAL; break; } LAGG_XLOCK(sc); sc->sc_flags = 0; if (rf->rf_flags & LAGG_F_HASHL2) sc->sc_flags |= MBUF_HASHFLAG_L2; if (rf->rf_flags & LAGG_F_HASHL3) sc->sc_flags |= MBUF_HASHFLAG_L3; if (rf->rf_flags & LAGG_F_HASHL4) sc->sc_flags |= MBUF_HASHFLAG_L4; LAGG_XUNLOCK(sc); break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_RLOCK(); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(); if_rele(tpif); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(); if_rele(tpif); break; case SIOCSLAGGPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } #ifdef INET6 /* * A laggport interface should not have inet6 address * because two interfaces with a valid link-local * scope zone must not be merged in any form. This * restriction is needed to prevent violation of * link-local scope zone. Attempts to add a laggport * interface which has inet6 addresses triggers * removal of all inet6 addresses on the member * interface. 
*/ if (in6ifa_llaonifp(tpif)) { in6_ifdetach(tpif); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", tpif->if_xname); } #endif LAGG_XLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_XUNLOCK(sc); if_rele(tpif); VLAN_CAPABILITIES(ifp); break; case SIOCSLAGGDELPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_XLOCK(sc); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_XUNLOCK(sc); if_rele(tpif); break; } error = lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); if_rele(tpif); VLAN_CAPABILITIES(ifp); break; case SIOCSIFFLAGS: /* Set flags on ports too */ LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setflags(lp, 1); } if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ lagg_stop(sc); LAGG_XUNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ LAGG_XUNLOCK(sc); (*ifp->if_init)(sc); } else LAGG_XUNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_clrmulti(lp); lagg_setmulti(lp); } LAGG_XUNLOCK(sc); error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(ifp); error = 0; break; case SIOCSIFMTU: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); else error = EINVAL; if (error != 0) { if_printf(ifp, "failed to change MTU to %d on port %s, " "reverting all ports to original MTU (%d)\n", ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu); break; } } if (error == 0) { ifp->if_mtu = ifr->ifr_mtu; } else { /* set every port back to the original MTU */ ifr->ifr_mtu = ifp->if_mtu; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } } LAGG_XUNLOCK(sc); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } #if defined(KERN_TLS) || defined(RATELIMIT) static inline struct lagg_snd_tag * mst_to_lst(struct m_snd_tag *mst) { return (__containerof(mst, struct lagg_snd_tag, com)); } /* * Look up the port used by a specific flow. This only works for lagg * protocols with deterministic port mappings (e.g. not roundrobin). * In addition protocols which use a hash to map flows to ports must * be configured to use the mbuf flowid rather than hashing packet * contents. 
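[Editor's note] For the LOADBALANCE case in lookup_snd_tag_port() below (the same arithmetic appears in lagg_lb_start()), the flowid-to-port mapping is a shift and a modulo; e.g. with flowid_shift = 16 and a four-port lagg (illustrative values):

uint32_t flowid = 0x12345678;		/* RSS hash carried in the mbuf */
uint32_t p = (flowid >> 16) % 4;	/* 0x1234 % 4 == 0 -> lb_ports[0] */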
*/ static struct lagg_port * lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype) { struct lagg_softc *sc; struct lagg_port *lp; struct lagg_lb *lb; uint32_t p; sc = ifp->if_softc; switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: return (lagg_link_active(sc, sc->sc_primary)); case LAGG_PROTO_LOADBALANCE: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || flowtype == M_HASHTYPE_NONE) return (NULL); p = flowid >> sc->flowid_shift; p %= sc->sc_count; lb = (struct lagg_lb *)sc->sc_psc; lp = lb->lb_ports[p]; return (lagg_link_active(sc, lp)); case LAGG_PROTO_LACP: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || flowtype == M_HASHTYPE_NONE) return (NULL); return (lacp_select_tx_port_by_hash(sc, flowid)); default: return (NULL); } } static int lagg_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct lagg_snd_tag *lst; struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lp_ifp; int error; sc = ifp->if_softc; LAGG_RLOCK(); lp = lookup_snd_tag_port(ifp, params->hdr.flowid, params->hdr.flowtype); if (lp == NULL) { LAGG_RUNLOCK(); return (EOPNOTSUPP); } if (lp->lp_ifp == NULL || lp->lp_ifp->if_snd_tag_alloc == NULL) { LAGG_RUNLOCK(); return (EOPNOTSUPP); } lp_ifp = lp->lp_ifp; if_ref(lp_ifp); LAGG_RUNLOCK(); lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT); if (lst == NULL) { if_rele(lp_ifp); return (ENOMEM); } error = lp_ifp->if_snd_tag_alloc(lp_ifp, params, &lst->tag); if_rele(lp_ifp); if (error) { free(lst, M_LAGG); return (error); } m_snd_tag_init(&lst->com, ifp); *ppmt = &lst->com; return (0); } static int lagg_snd_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { struct lagg_snd_tag *lst; lst = mst_to_lst(mst); return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params)); } static int lagg_snd_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { struct lagg_snd_tag *lst; lst = mst_to_lst(mst); return (lst->tag->ifp->if_snd_tag_query(lst->tag, params)); } static void lagg_snd_tag_free(struct m_snd_tag *mst) { struct lagg_snd_tag *lst; lst = mst_to_lst(mst); m_snd_tag_rele(lst->tag); free(lst, M_LAGG); } static void lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) { /* * For lagg, we have an indirect * interface. The caller needs to * get a ratelimit tag on the actual * interface the flow will go on. 
*/ q->rate_table = NULL; q->flags = RT_IS_INDIRECT; q->max_flows = 0; q->number_of_rates = 0; } #endif static int lagg_setmulti(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct ifnet *scifp = sc->sc_ifp; struct lagg_mc *mc; struct ifmultiaddr *ifma; int error; IF_ADDR_WLOCK(scifp); CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp->if_index; mc->mc_ifma = NULL; SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); } IF_ADDR_WUNLOCK(scifp); SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) { error = if_addmulti(ifp, (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma); if (error) return (error); } return (0); } static int lagg_clrmulti(struct lagg_port *lp) { struct lagg_mc *mc; LAGG_XLOCK_ASSERT(lp->lp_softc); while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && lp->lp_detaching == 0) if_delmulti_ifma(mc->mc_ifma); free(mc, M_LAGG); } return (0); } static int lagg_setcaps(struct lagg_port *lp, int cap) { struct ifreq ifr; if (lp->lp_ifp->if_capenable == cap) return (0); if (lp->lp_ioctl == NULL) return (ENXIO); ifr.ifr_reqcap = cap; return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr)); } /* Handle a ref counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct ifnet *ifp = lp->lp_ifp; int error; LAGG_XLOCK_ASSERT(sc); status = status ? (scifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded ports status is different from what * we want it to be. If it is, flip it. We record ports * status in lp_ifflags so that we won't clear ports flag * we haven't set. In fact, we don't clear or set ports * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual ports flags. */ if (status != (lp->lp_ifflags & flag)) { error = (*func)(ifp, status); if (error) return (error); lp->lp_ifflags &= ~flag; lp->lp_ifflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the lagg port * if "status" is true, update ports flags respective to the lagg * if "status" is false, forcedly clear the flags set on port. 
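[Editor's note] lagg_setflag() above uses lp_ifflags to record exactly which flag references the lagg currently holds on a port; for the IFF_PROMISC entry of lagg_pflags[] it reduces to the following (isolated sketch of the logic shown above, not a second implementation):

/* 'status' becomes the lagg's current IFF_PROMISC bit, or 0 on teardown. */
status = status ? (scifp->if_flags & IFF_PROMISC) : 0;
if (status != (lp->lp_ifflags & IFF_PROMISC)) {
	if ((error = ifpromisc(ifp, status)) != 0)
		return (error);
	lp->lp_ifflags &= ~IFF_PROMISC;		/* record what we hold now */
	lp->lp_ifflags |= status;
}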
*/ static int lagg_setflags(struct lagg_port *lp, int status) { int error, i; for (i = 0; lagg_pflags[i].flag; i++) { error = lagg_setflag(lp, lagg_pflags[i].flag, status, lagg_pflags[i].func); if (error) return (error); } return (0); } static int lagg_transmit(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) MPASS(m->m_pkthdr.snd_tag->ifp == ifp); #endif LAGG_RLOCK(); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { LAGG_RUNLOCK(); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } ETHER_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); LAGG_RUNLOCK(); if (error != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } /* * The ifp->if_qflush entry point for lagg(4) is no-op. */ static void lagg_qflush(struct ifnet *ifp __unused) { } static struct mbuf * lagg_input(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; LAGG_RLOCK(); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || lp->lp_detaching != 0 || sc->sc_proto == LAGG_PROTO_NONE) { LAGG_RUNLOCK(); m_freem(m); return (NULL); } ETHER_BPF_MTAP(scifp, m); m = lagg_proto_input(sc, lp, m); if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) { m_freem(m); m = NULL; } LAGG_RUNLOCK(); return (m); } static int lagg_media_change(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; if (sc->sc_ifflags & IFF_DEBUG) printf("%s\n", __func__); /* Ignore */ return (0); } static void lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; imr->ifm_status = IFM_AVALID; imr->ifm_active = IFM_ETHER | IFM_AUTO; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp)) imr->ifm_status |= IFM_ACTIVE; } LAGG_RUNLOCK(); } static void lagg_linkstate(struct lagg_softc *sc) { struct lagg_port *lp; int new_link = LINK_STATE_DOWN; uint64_t speed; LAGG_XLOCK_ASSERT(sc); /* LACP handles link state itself */ if (sc->sc_proto == LAGG_PROTO_LACP) return; /* Our link is considered up if at least one of our ports is active */ LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } LAGG_RUNLOCK(); if_link_state_change(sc->sc_ifp, new_link); /* Update if_baudrate to reflect the max possible speed */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ? 
sc->sc_primary->lp_ifp->if_baudrate : 0; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: speed = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; LAGG_RUNLOCK(); sc->sc_ifp->if_baudrate = speed; break; case LAGG_PROTO_LACP: /* LACP updates if_baudrate itself */ break; } } static void lagg_port_state(struct ifnet *ifp, int state) { struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; struct lagg_softc *sc = NULL; if (lp != NULL) sc = lp->lp_softc; if (sc == NULL) return; LAGG_XLOCK(sc); lagg_linkstate(sc); lagg_proto_linkstate(sc, lp); LAGG_XUNLOCK(sc); } struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; /* * Search a port which reports an active link state. */ #ifdef INVARIANTS /* * This is called with either LAGG_RLOCK() held or * LAGG_XLOCK(sc) held. */ if (!in_epoch(net_epoch_preempt)) LAGG_XLOCK_ASSERT(sc); #endif if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { rval = lp; goto found; } if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL && LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } search: CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { return (lp_next); } } found: return (rval); } int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { struct lagg_snd_tag *lst; struct m_snd_tag *mst; mst = m->m_pkthdr.snd_tag; lst = mst_to_lst(mst); if (lst->tag->ifp != ifp) { m_freem(m); return (EAGAIN); } m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag); m_snd_tag_rele(mst); } #endif return (ifp->if_transmit)(ifp, m); } /* * Simple round robin aggregation */ static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_seq = 0; sc->sc_stride = 1; } static int lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; uint32_t p; p = atomic_fetchadd_32(&sc->sc_seq, 1); p /= sc->sc_stride; p %= sc->sc_count; lp = CK_SLIST_FIRST(&sc->sc_ports); while (p--) lp = CK_SLIST_NEXT(lp, lp_entries); /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. 
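[Editor's note] The p = (seq / stride) % count arithmetic in lagg_rr_start() above hands each port sc_stride consecutive packets per turn: with sc_stride = 2 and three ports, successive sc_seq values map to ports 0,0,1,1,2,2,0,... The mapping in isolation (example values, hypothetical helper):

static uint32_t
rr_port_for_seq(uint32_t seq)
{
	return ((seq / 2) % 3);		/* stride 2, 3 ports */
}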
*/ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Broadcast mode */ static int lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) { int active_ports = 0; int errors = 0; int ret; struct lagg_port *lp, *last = NULL; struct mbuf *m0; LAGG_RLOCK_ASSERT(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; active_ports++; if (last != NULL) { m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m0 == NULL) { ret = ENOBUFS; errors++; break; } ret = lagg_enqueue(last->lp_ifp, m0); if (ret != 0) errors++; } last = lp; } if (last == NULL) { m_freem(m); return (ENOENT); } if ((last = lagg_link_active(sc, last)) == NULL) { m_freem(m); return (ENETDOWN); } ret = lagg_enqueue(last->lp_ifp, m); if (ret != 0) errors++; if (errors == 0) return (ret); return (0); } static struct mbuf * lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Active failover */ static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* * If tmp_tp is null, we've received a packet when all * our links are down. Weird, but process it anyway.
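*/

/*
 * [Editorial sketch -- not part of this commit.] lagg_bcast_start() above
 * duplicates the mbuf for every active port except the last one found,
 * which receives the original, saving one copy per transmission. A
 * userspace sketch of that shape; malloc/printf stand in for m_copym()
 * and lagg_enqueue(), and the names are invented.
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pkt { char payload[32]; };

static void
bcast(struct pkt *m, const int active[], int nports)
{
    int last = -1;

    for (int i = 0; i < nports; i++) {
        if (!active[i])
            continue;
        if (last != -1) {
            struct pkt *m0 = malloc(sizeof(*m0));   /* m_copym() */

            if (m0 == NULL)
                break;                              /* ENOBUFS */
            memcpy(m0, m, sizeof(*m));
            printf("copy -> port %d\n", last);
            free(m0);           /* stands in for transmit */
        }
        last = i;
    }
    if (last == -1) {
        free(m);                /* no active ports: drop (ENOENT) */
        return;
    }
    printf("original -> port %d\n", last);
    free(m);                    /* stands in for transmit */
}

int
main(void)
{
    int active[] = { 1, 0, 1, 1 };
    struct pkt *m = calloc(1, sizeof(*m));

    bcast(m, active, 4);        /* copies to ports 0 and 2, original to 3 */
    return (0);
}
#endif /* 0 */

/*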
*/ if ((tmp_tp == NULL || tmp_tp == lp)) { m->m_pkthdr.rcvif = ifp; return (m); } } m_freem(m); return (NULL); } /* * Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; LAGG_XLOCK_ASSERT(sc); lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO); lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); } static void lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb; lb = (struct lagg_lb *)sc->sc_psc; if (lb != NULL) free(lb, M_LAGG); } static int lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; int i = 0, rv; rv = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; if (i >= LAGG_MAX_PORTS) { rv = EINVAL; break; } if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } return (rv); } static int lagg_lb_port_create(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; return (lagg_lb_porttable(sc, NULL)); } static void lagg_lb_port_destroy(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; lagg_lb_porttable(sc, lp); } static int lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp = NULL; uint32_t p = 0; if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; else p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. 
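*/

/*
 * [Editorial sketch -- not part of this commit.] Port selection in
 * lagg_lb_start() above: prefer the NIC-supplied flowid (shifted by
 * flowid_shift), otherwise fall back to a software hash of the headers,
 * then reduce modulo the port count to index the precomputed lb_ports[]
 * table. The FNV-1a hash below is a stand-in for the kernel's
 * m_ether_tcpip_hash(), and every name is invented.
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint32_t
flow_hash(const void *key, size_t len)
{
    const uint8_t *p = key;
    uint32_t h = 2166136261u;       /* FNV-1a, any stable hash works */

    while (len-- > 0)
        h = (h ^ *p++) * 16777619u;
    return (h);
}

struct flow { uint32_t src, dst; uint16_t sport, dport; };

static unsigned
lb_pick(const struct flow *f, int have_flowid, uint32_t flowid,
    unsigned shift, unsigned nports)
{
    uint32_t p;

    if (have_flowid)
        p = flowid >> shift;        /* reuse the NIC's RSS hash */
    else
        p = flow_hash(f, sizeof(*f));
    return (p % nports);            /* index into the port table */
}

int
main(void)
{
    struct flow f = { 0x0a000001, 0x0a000002, 12345, 80 };

    printf("port %u\n", lb_pick(&f, 0, 0, 16, 3));
    return (0);
}
#endif /* 0 */

/*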
*/ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; lacp_attach(sc); LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static void lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; void *psc; LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); psc = sc->sc_psc; sc->sc_psc = NULL; lacp_detach(psc); } static void lagg_lacp_lladdr(struct lagg_softc *sc) { struct lagg_port *lp; LAGG_SXLOCK_ASSERT(sc); /* purge all the lacp ports */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* add them back in */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static int lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; lp = lacp_select_tx_port(sc, m); if (lp == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct ether_header *eh; u_short etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = lacp_input(lp, m); if (m == NULL) return (NULL); } /* * If the port is not collecting or not in the active aggregator then * free and return. */ if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { m_freem(m); return (NULL); } m->m_pkthdr.rcvif = ifp; return (m); } Index: head/sys/net/if_vlan.c =================================================================== --- head/sys/net/if_vlan.c (revision 356754) +++ head/sys/net/if_vlan.c (revision 356755) @@ -1,2079 +1,2079 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * Copyright 2012 ADARA Networks, Inc. * Copyright 2017 Dell EMC Isilon * * Portions of this software were developed by Robert N. M. Watson under * contract to ADARA Networks, Inc. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. 
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. * This is sort of sneaky in the implementation, since * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that * ether_output() sends to us via if_transmit(), rewrite them for * use by the real outgoing interface, and ask it to send them. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_kern_tls.h" #include "opt_vlan.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #define VLAN_DEF_HWIDTH 4 #define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) #define UP_AND_RUNNING(ifp) \ ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) CK_SLIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ struct mtx lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ #else struct ifvlanhead *hash; /* dynamic hash-list table */ uint16_t hmask; uint16_t hwidth; #endif int refcnt; }; #if defined(KERN_TLS) || defined(RATELIMIT) struct vlan_snd_tag { struct m_snd_tag com; struct m_snd_tag *tag; }; static inline struct vlan_snd_tag * mst_to_vst(struct m_snd_tag *mst) { return (__containerof(mst, struct vlan_snd_tag, com)); } #endif /* * This macro provides a facility to iterate over every vlan on a trunk with * the assumption that none will be added/removed during iteration. */ #ifdef VLAN_ARRAY #define VLAN_FOREACH(_ifv, _trunk) \ size_t _i; \ for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i]) != NULL) #else /* VLAN_ARRAY */ #define VLAN_FOREACH(_ifv, _trunk) \ struct ifvlan *_next; \ size_t _i; \ for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \ CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) #endif /* VLAN_ARRAY */ /* * This macro provides a facility to iterate over every vlan on a trunk while * also modifying the number of vlans on the trunk. The iteration continues * until some condition is met or there are no more vlans on the trunk. */ #ifdef VLAN_ARRAY /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i])) #else /* VLAN_ARRAY */ /* * The hash table case is more complicated. We allow for the hash table to be * modified (i.e. vlans removed) while we are iterating over it. To allow for * this we must restart the iteration every time we "touch" something during * the iteration, since removal will resize the hash table and invalidate our * current position. 
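*/

/*
 * [Editorial sketch -- not part of this commit.] mst_to_vst() above is
 * the classic containerof idiom: given a pointer to the embedded
 * m_snd_tag member, recover the enclosing vlan_snd_tag. A portable
 * userspace equivalent built on offsetof(); the *_demo types are
 * invented stand-ins.
 */
#if 0	/* standalone userspace illustration */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct m_snd_tag_demo { int refcount; };
struct vlan_snd_tag_demo {
    struct m_snd_tag_demo com;      /* embedded "base class" */
    void *tag;
};

int
main(void)
{
    struct vlan_snd_tag_demo vst = { { 1 }, NULL };
    struct m_snd_tag_demo *mst = &vst.com;
    struct vlan_snd_tag_demo *back =
        container_of(mst, struct vlan_snd_tag_demo, com);

    printf("%d\n", back == &vst);   /* prints 1 */
    return (0);
}
#endif /* 0 */

/*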
If acting on the touched element causes the trunk to be * emptied, then iteration also stops. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ bool _touch = false; \ for (_i = 0; \ !(_cond) && _i < (1 << (_trunk)->hwidth); \ _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \ if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ (_touch = true)) #endif /* VLAN_ARRAY */ struct vlan_mc_entry { struct sockaddr_dl mc_addr; CK_SLIST_ENTRY(vlan_mc_entry) mc_entries; struct epoch_context mc_epoch_ctx; }; struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; #define TRUNK(ifv) ((ifv)->ifv_trunk) #define PARENT(ifv) ((ifv)->ifv_trunk->parent) void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ int ifv_capenable; int ifv_encaplen; /* encapsulation length */ int ifv_mtufudge; /* MTU fudged by this much */ int ifv_mintu; /* min transmission unit */ uint16_t ifv_proto; /* encapsulation ethertype */ uint16_t ifv_tag; /* tag to apply on packets leaving if */ uint16_t ifv_vid; /* VLAN ID */ uint8_t ifv_pcp; /* Priority Code Point (PCP). */ struct task lladdr_task; CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; /* Special flags we should propagate to parent. */ static struct { int flag; int (*func)(struct ifnet *, int); } vlan_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; extern int vlan_mtag_pcp; static const char vlanname[] = "vlan"; static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface"); static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; /* * if_vlan uses two module-level synchronizations primitives to allow concurrent * modification of vlan interfaces and (mostly) allow for vlans to be destroyed * while they are being used for tx/rx. To accomplish this in a way that has * acceptable performance and cooperation with other parts of the network stack * there is a non-sleepable epoch(9) and an sx(9). * * The performance-sensitive paths that warrant using the epoch(9) are * vlan_transmit and vlan_input. Both have to check for the vlan interface's * existence using if_vlantrunk, and being in the network tx/rx paths the use * of an epoch(9) gives a measureable improvement in performance. * * The reason for having an sx(9) is mostly because there are still areas that * must be sleepable and also have safe concurrent access to a vlan interface. * Since the sx(9) exists, it is used by default in most paths unless sleeping * is not permitted, or if it is not clear whether sleeping is permitted. * */ #define _VLAN_SX_ID ifv_sx static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_INIT() \ sx_init(&_VLAN_SX_ID, "vlan_sx") #define VLAN_LOCKING_DESTROY() \ sx_destroy(&_VLAN_SX_ID) #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) #define VLAN_XLOCK() sx_xlock(&_VLAN_SX_ID) #define VLAN_XUNLOCK() sx_xunlock(&_VLAN_SX_ID) #define VLAN_SLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_SLOCKED) #define VLAN_XLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_XLOCKED) #define VLAN_SXLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_LOCKED) /* * We also have a per-trunk mutex that should be acquired when changing * its state. 
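*/

/*
 * [Editorial sketch -- not part of this commit.] The hash-table variant
 * of VLAN_FOREACH_UNTIL_SAFE above restarts its scan from bucket 0 after
 * every "touch", because acting on an element may resize the table and
 * invalidate the current position. A toy model of that control flow with
 * plain counters instead of vlan lists; it terminates because each
 * restart follows the removal of one element.
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>

#define NBUCKET 4

static int buckets[NBUCKET] = { 2, 0, 1, 3 };   /* element counts */

int
main(void)
{
    int handled = 0;

    for (int i = 0; i < NBUCKET; ) {
        if (buckets[i] == 0) {
            i++;            /* bucket empty: move on */
            continue;
        }
        buckets[i]--;       /* "touch": act on one element */
        handled++;
        i = 0;              /* table may have been resized: restart */
    }
    printf("handled %d elements\n", handled);   /* prints 6 */
    return (0);
}
#endif /* 0 */

/*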
*/ #define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) #define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) #define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) #define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) #define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); /* * The VLAN_ARRAY substitutes the dynamic hash with a static array * with 4096 entries. In theory this can give a boost in processing, * however in practice it does not. Probably this is because the array * is too big to fit into CPU cache. */ #ifndef VLAN_ARRAY static void vlan_inithash(struct ifvlantrunk *trunk); static void vlan_freehash(struct ifvlantrunk *trunk); static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid); #endif static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); static int vlan_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int vlan_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void vlan_snd_tag_free(struct m_snd_tag *); #endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); static void vlan_link_state(struct ifnet *ifp); static void vlan_capabilities(struct ifvlan *ifv); static void vlan_trunk_capabilities(struct ifnet *ifp); static struct ifnet *vlan_clone_match_ethervid(const char *, int *); static int vlan_clone_match(struct if_clone *, const char *); static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t); static int vlan_clone_destroy(struct if_clone *, struct ifnet *); static void vlan_ifdetach(void *arg, struct ifnet *ifp); static void vlan_iflladdr(void *arg, struct ifnet *ifp); static void vlan_lladdr_fn(void *arg, int pending); static struct if_clone *vlan_cloner; #ifdef VIMAGE VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner); #define V_vlan_cloner VNET(vlan_cloner) #endif static void vlan_mc_free(struct epoch_context *ctx) { struct vlan_mc_entry *mc = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx); free(mc, M_VLAN); } #ifndef VLAN_ARRAY #define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m)) static void vlan_inithash(struct ifvlantrunk *trunk) { int i, n; /* * The trunk must not be locked here since we call malloc(M_WAITOK). * It is OK in case this function is called before the trunk struct * gets hooked up and becomes visible from other threads. 
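*/

/*
 * [Editorial sketch -- not part of this commit.] The HASH() macro
 * defined just above folds the 12-bit VID onto the bucket mask by xoring
 * the VID with copies of itself shifted right by 4 and by 8. A quick
 * standalone check of how the 4094 valid VIDs spread over the default
 * 16-bucket table (hwidth 4, mask 0xf):
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>

#define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m))

int
main(void)
{
    int buckets[16] = { 0 };

    for (int vid = 1; vid < 4095; vid++)    /* valid VIDs: 1..4094 */
        buckets[HASH(vid, 0xf)]++;
    for (int i = 0; i < 16; i++)
        printf("bucket %2d: %d vids\n", i, buckets[i]);
    return (0);
}
#endif /* 0 */

/*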
*/ KASSERT(trunk->hwidth == 0 && trunk->hash == NULL, ("%s: hash already initialized", __func__)); trunk->hwidth = VLAN_DEF_HWIDTH; n = 1 << trunk->hwidth; trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) CK_SLIST_INIT(&trunk->hash[i]); } static void vlan_freehash(struct ifvlantrunk *trunk) { #ifdef INVARIANTS int i; KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); trunk->hash = NULL; trunk->hwidth = trunk->hmask = 0; } static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); /* * Grow the hash when the number of vlans exceeds half of the number of * hash buckets squared. This will make the average linked-list length * buckets/2. */ if (trunk->refcnt > (b * b) / 2) { vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); } static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); } panic("%s: vlan not found\n", __func__); return (ENOENT); /*NOTREACHED*/ } /* * Grow the hash larger or smaller if memory permits. */ static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch) { struct ifvlan *ifv; struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { /* Harmless yet obvious coding error */ printf("%s: howmuch is 0\n", __func__); return; } hwidth2 = trunk->hwidth + howmuch; n = 1 << trunk->hwidth; n2 = 1 << hwidth2; /* Do not shrink the table below the default */ if (hwidth2 < VLAN_DEF_HWIDTH) return; hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) CK_SLIST_INIT(&hash2[j]); for (i = 0; i < n; i++) while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) { CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } NET_EPOCH_WAIT(); free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; trunk->hmask = n2 - 1; if (bootverbose) if_printf(trunk->parent, "VLAN hash table resized from %d to %d buckets\n", n, n2); } static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) return (ifv); return (NULL); } #if 0 /* Debugging code to view the hashtables. 
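*/

/*
 * [Editorial sketch -- not part of this commit.] The resize policy in
 * vlan_inshash()/vlan_remhash() above: with b buckets, grow once more
 * than b*b/2 vlans are present, shrink below the same bound, keeping the
 * average chain length around b/2. Tabulating the thresholds:
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>

int
main(void)
{
    for (int hwidth = 4; hwidth <= 8; hwidth++) {
        int b = 1 << hwidth;

        printf("hwidth %d: %4d buckets, grow past %5d vlans\n",
            hwidth, b, (b * b) / 2);
    }
    return (0);
}
#endif /* 0 */

/*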
*/ static void vlan_dumphash(struct ifvlantrunk *trunk) { int i; struct ifvlan *ifv; for (i = 0; i < (1 << trunk->hwidth); i++) { printf("%d: ", i); CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list) printf("%s ", ifv->ifv_ifp->if_xname); printf("\n"); } } #endif /* 0 */ #else static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { return trunk->vlans[vid]; } static __inline int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { if (trunk->vlans[ifv->ifv_vid] != NULL) return EEXIST; trunk->vlans[ifv->ifv_vid] = ifv; trunk->refcnt++; return (0); } static __inline int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { trunk->vlans[ifv->ifv_vid] = NULL; trunk->refcnt--; return (0); } static __inline void vlan_freehash(struct ifvlantrunk *trunk) { } static __inline void vlan_inithash(struct ifvlantrunk *trunk) { } #endif /* !VLAN_ARRAY */ static void trunk_destroy(struct ifvlantrunk *trunk) { VLAN_XLOCK_ASSERT(); vlan_freehash(trunk); trunk->parent->if_vlantrunk = NULL; TRUNK_LOCK_DESTROY(trunk); if_rele(trunk->parent); free(trunk, M_VLAN); } /* * Program our multicast filter. What we're actually doing is * programming the multicast filter of the parent. This has the * side effect of causing the parent interface to receive multicast * traffic that it doesn't really want, which ends up being discarded * later by the upper protocol layers. Unfortunately, there's no way * to avoid this: there really is only one physical interface. */ static int vlan_setmulti(struct ifnet *ifp) { struct ifnet *ifp_p; struct ifmultiaddr *ifma; struct ifvlan *sc; struct vlan_mc_entry *mc; int error; VLAN_XLOCK_ASSERT(); /* Find the parent. */ sc = ifp->if_softc; ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); - epoch_call(net_epoch_preempt, &mc->mc_epoch_ctx, vlan_mc_free); + NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx); } /* Now program new ones. */ IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(ifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); } IF_ADDR_WUNLOCK(ifp); CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, NULL); if (error) return (error); } CURVNET_RESTORE(); return (0); } /* * A handler for parent interface link layer address changes. * If the parent interface link layer address is changed we * should also change it on all children vlans. */ static void vlan_iflladdr(void *arg __unused, struct ifnet *ifp) { struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; /* Need the epoch since this is run on taskqueue_swi. */ NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } /* * OK, it's a trunk. Loop over and change all vlan's lladdrs on it. * We need an exclusive lock here to prevent concurrent SIOCSIFLLADDR * ioctl calls on the parent garbling the lladdr of the child vlan. 
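*/

/*
 * [Editorial sketch -- not part of this commit.] The hunk above,
 * replacing the open-coded epoch_call(net_epoch_preempt, ...) with the
 * NET_EPOCH_CALL() wrapper, is the substance of this commit. In either
 * spelling the pattern is the same: an object that lockless
 * (epoch-protected) readers may still be traversing is unlinked now but
 * freed only after a grace period, via a queued callback. A toy
 * single-threaded model of "queue now, reclaim later"; every *_demo name
 * is invented.
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>
#include <stdlib.h>

struct epoch_ctx_demo {
    void (*cb)(struct epoch_ctx_demo *);
    struct epoch_ctx_demo *next;
};

static struct epoch_ctx_demo *deferred;     /* queued, not yet reclaimed */

static void
epoch_call_demo(struct epoch_ctx_demo *ctx,
    void (*cb)(struct epoch_ctx_demo *))
{
    ctx->cb = cb;           /* must not free yet: readers may see it */
    ctx->next = deferred;
    deferred = ctx;
}

static void
epoch_drain_demo(void)      /* models "grace period has expired" */
{
    while (deferred != NULL) {
        struct epoch_ctx_demo *c = deferred;

        deferred = c->next;
        c->cb(c);
    }
}

struct mc_entry_demo {
    struct epoch_ctx_demo ctx;  /* first member: plain cast recovers it */
    int addr;
};

static void
mc_free_demo(struct epoch_ctx_demo *ctx)
{
    free((struct mc_entry_demo *)ctx);
    puts("entry reclaimed after grace period");
}

int
main(void)
{
    struct mc_entry_demo *mc = calloc(1, sizeof(*mc));

    epoch_call_demo(&mc->ctx, mc_free_demo);    /* unlink + defer */
    epoch_drain_demo();
    return (0);
}
#endif /* 0 */

/*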
*/ TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { /* * Copy new new lladdr into the ifv_ifp, enqueue a task * to actually call if_setlladdr. if_setlladdr needs to * be deferred to a taskqueue because it will call into * the if_vlan ioctl path and try to acquire the global * lock. */ ifv_ifp = ifv->ifv_ifp; bcopy(IF_LLADDR(ifp), IF_LLADDR(ifv_ifp), ifp->if_addrlen); sdl = (struct sockaddr_dl *)ifv_ifp->if_addr->ifa_addr; sdl->sdl_alen = ifp->if_addrlen; taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } /* * A handler for network interface departure events. * Track departure of trunks here so that we don't access invalid * pointers or whatever if a trunk is ripped from under us, e.g., * by ejecting its hot-plug card. However, if an ifnet is simply * being renamed, then there's no need to tear down the state. */ static void vlan_ifdetach(void *arg __unused, struct ifnet *ifp) { struct ifvlan *ifv; struct ifvlantrunk *trunk; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; VLAN_XLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_XUNLOCK(); return; } /* * OK, it's a trunk. Loop over and detach all vlan's on it. * Check trunk pointer after each vlan_unconfig() as it will * free it and set to NULL after the last vlan was detached. */ VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk, ifp->if_vlantrunk == NULL) vlan_unconfig_locked(ifv->ifv_ifp, 1); /* Trunk should have been destroyed in vlan_unconfig(). */ KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__)); VLAN_XUNLOCK(); } /* * Return the trunk device for a virtual interface. */ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; ifp = NULL; if (ifv->ifv_trunk) ifp = PARENT(ifv); return (ifp); } /* * Return the 12-bit VLAN VID for this interface, for use by external * components such as Infiniband. * * XXXRW: Note that the function name here is historical; it should be named * vlan_vid(). */ static int vlan_tag(struct ifnet *ifp, uint16_t *vidp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *vidp = ifv->ifv_vid; return (0); } static int vlan_pcp(struct ifnet *ifp, uint16_t *pcpp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *pcpp = ifv->ifv_pcp; return (0); } /* * Return a driver specific cookie for this interface. Synchronization * with setcookie must be provided by the driver. */ static void * vlan_cookie(struct ifnet *ifp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; return (ifv->ifv_cookie); } /* * Store a cookie in our softc that drivers can use to store driver * private per-instance data in. */ static int vlan_setcookie(struct ifnet *ifp, void *cookie) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; ifv->ifv_cookie = cookie; return (0); } /* * Return the vlan device present at the specific VID. */ static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { struct ifvlantrunk *trunk; struct ifvlan *ifv; NET_EPOCH_ASSERT(); trunk = ifp->if_vlantrunk; if (trunk == NULL) return (NULL); ifp = NULL; ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; return (ifp); } /* * Recalculate the cached VLAN tag exposed via the MIB. 
*/ static void vlan_tag_recalculate(struct ifvlan *ifv) { ifv->ifv_tag = EVL_MAKETAG(ifv->ifv_vid, ifv->ifv_pcp, 0); } /* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and * set here. No one else in the system should be aware of this so * we use an explicit reference here. */ extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* For if_link_state_change() eyes only... */ extern void (*vlan_link_state_p)(struct ifnet *); static int vlan_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if (ifdetach_tag == NULL) return (ENOMEM); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY); if (iflladdr_tag == NULL) return (ENOMEM); VLAN_LOCKING_INIT(); vlan_input_p = vlan_input; vlan_link_state_p = vlan_link_state; vlan_trunk_cap_p = vlan_trunk_capabilities; vlan_trunkdev_p = vlan_trunkdev; vlan_cookie_p = vlan_cookie; vlan_setcookie_p = vlan_setcookie; vlan_tag_p = vlan_tag; vlan_pcp_p = vlan_pcp; vlan_devat_p = vlan_devat; #ifndef VIMAGE vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); #endif if (bootverbose) printf("vlan: initialized, using " #ifdef VLAN_ARRAY "full-size arrays" #else "hash tables with chaining" #endif "\n"); break; case MOD_UNLOAD: #ifndef VIMAGE if_clone_detach(vlan_cloner); #endif EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); vlan_input_p = NULL; vlan_link_state_p = NULL; vlan_trunk_cap_p = NULL; vlan_trunkdev_p = NULL; vlan_tag_p = NULL; vlan_cookie_p = NULL; vlan_setcookie_p = NULL; vlan_devat_p = NULL; VLAN_LOCKING_DESTROY(); if (bootverbose) printf("vlan: unloaded\n"); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t vlan_mod = { "if_vlan", vlan_modevent, 0 }; DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vlan, 3); #ifdef VIMAGE static void vnet_vlan_init(const void *unused __unused) { vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); V_vlan_cloner = vlan_cloner; } VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_vlan_init, NULL); static void vnet_vlan_uninit(const void *unused __unused) { if_clone_detach(V_vlan_cloner); } VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_vlan_uninit, NULL); #endif /* * Check for . style interface names. */ static struct ifnet * vlan_clone_match_ethervid(const char *name, int *vidp) { char ifname[IFNAMSIZ]; char *cp; struct ifnet *ifp; int vid; strlcpy(ifname, name, IFNAMSIZ); if ((cp = strchr(ifname, '.')) == NULL) return (NULL); *cp = '\0'; if ((ifp = ifunit_ref(ifname)) == NULL) return (NULL); /* Parse VID. 
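*/

/*
 * [Editorial sketch -- not part of this commit.] vlan_tag_recalculate()
 * at the top of this stretch packs the cached tag with
 * EVL_MAKETAG(vid, pcp, 0). The 802.1Q TCI layout it encodes is PCP in
 * bits 15-13, DEI/CFI in bit 12, VID in bits 11-0; the helper below
 * re-derives that packing rather than quoting the kernel macros.
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

static uint16_t
make_tag(uint16_t vid, uint8_t pcp, uint8_t dei)
{
    return (uint16_t)((pcp << 13) | (dei << 12) | (vid & 0xfff));
}

int
main(void)
{
    uint16_t tag = make_tag(66, 5, 0);

    assert((tag & 0xfff) == 66);    /* what EVL_VLANOFTAG extracts */
    assert((tag >> 13) == 5);       /* what EVL_PRIOFTAG extracts */
    printf("tci %#06x\n", tag);
    return (0);
}
#endif /* 0 */

/*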
*/ if (*++cp == '\0') { if_rele(ifp); return (NULL); } vid = 0; for(; *cp >= '0' && *cp <= '9'; cp++) vid = (vid * 10) + (*cp - '0'); if (*cp != '\0') { if_rele(ifp); return (NULL); } if (vidp != NULL) *vidp = vid; return (ifp); } static int vlan_clone_match(struct if_clone *ifc, const char *name) { const char *cp; if (vlan_clone_match_ethervid(name, NULL) != NULL) return (1); if (strncmp(vlanname, name, strlen(vlanname)) != 0) return (0); for (cp = name + 4; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int wildcard; int unit; int error; int vid; struct ifvlan *ifv; struct ifnet *ifp; struct ifnet *p; struct ifaddr *ifa; struct sockaddr_dl *sdl; struct vlanreq vlr; static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * There are 3 (ugh) ways to specify the cloned device: * o pass a parameter block with the clone request. * o specify parameters in the text of the clone device name * o specify no parameters and get an unattached device that * must be configured separately. * The first technique is preferred; the latter two are * supported for backwards compatibility. * * XXXRW: Note historic use of the word "tag" here. New ioctls may be * called for. */ if (params) { error = copyin(params, &vlr, sizeof(vlr)); if (error) return error; p = ifunit_ref(vlr.vlr_parent); if (p == NULL) return (ENXIO); error = ifc_name2unit(name, &unit); if (error != 0) { if_rele(p); return (error); } vid = vlr.vlr_tag; wildcard = (unit < 0); } else if ((p = vlan_clone_match_ethervid(name, &vid)) != NULL) { unit = -1; wildcard = 0; } else { p = NULL; error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); } error = ifc_alloc_unit(ifc, &unit); if (error != 0) { if (p != NULL) if_rele(p); return (error); } /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { panic("%s: interface name too long", __func__); } } ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { ifc_free_unit(ifc, unit); free(ifv, M_VLAN); if (p != NULL) if_rele(p); return (ENOSPC); } CK_SLIST_INIT(&ifv->vlan_mc_listhead); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because * we don't conform to the default naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; ifp->if_init = vlan_init; ifp->if_transmit = vlan_transmit; ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; ifp->if_snd_tag_modify = vlan_snd_tag_modify; ifp->if_snd_tag_query = vlan_snd_tag_query; ifp->if_snd_tag_free = vlan_snd_tag_free; #endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_L2VLAN; ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_L2VLAN; if (p != NULL) { error = vlan_config(ifv, p, vid); if_rele(p); if (error != 0) { /* * Since we've partially failed, we need to back * out all the way, otherwise userland could get * confused. Thus, we destroy the interface. 
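*/

/*
 * [Editorial sketch -- not part of this commit.] The loop just above is
 * the whole of the "ifname.vid" parser: split at the first '.', then
 * accept decimal digits only, rejecting trailing junk. A self-contained
 * restatement, slightly stricter than the kernel code in that it also
 * rejects an empty interface name; names are invented.
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>
#include <string.h>
#include <stddef.h>

static int
parse_ethervid(const char *name, char *ifname, size_t sz, int *vidp)
{
    const char *dot = strchr(name, '.');
    int vid = 0;

    if (dot == NULL || dot == name || dot[1] == '\0')
        return (-1);
    if ((size_t)(dot - name) >= sz)
        return (-1);
    memcpy(ifname, name, dot - name);
    ifname[dot - name] = '\0';
    for (const char *cp = dot + 1; *cp != '\0'; cp++) {
        if (*cp < '0' || *cp > '9')
            return (-1);        /* trailing junk: no match */
        vid = vid * 10 + (*cp - '0');
    }
    *vidp = vid;
    return (0);
}

int
main(void)
{
    char ifname[16];
    int vid;

    if (parse_ethervid("em0.42", ifname, sizeof(ifname), &vid) == 0)
        printf("parent %s, vid %d\n", ifname, vid);  /* em0, 42 */
    return (0);
}
#endif /* 0 */

/*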
*/ ether_ifdetach(ifp); vlan_unconfig(ifp); if_free(ifp); ifc_free_unit(ifc, unit); free(ifv, M_VLAN); return (error); } } return (0); } static int vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct ifvlan *ifv = ifp->if_softc; int unit = ifp->if_dunit; ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ /* * We should have the only reference to the ifv now, so we can now * drain any remaining lladdr task before freeing the ifnet and the * ifvlan. */ taskqueue_drain(taskqueue_thread, &ifv->lladdr_task); NET_EPOCH_WAIT(); if_free(ifp); free(ifv, M_VLAN); ifc_free_unit(ifc, unit); return (0); } /* * The ifp->if_init entry point for vlan(4) is a no-op. */ static void vlan_init(void *foo __unused) { } /* * The if_transmit method for vlan(4) interface. */ static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; NET_EPOCH_ASSERT(); ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } p = PARENT(ifv); len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; BPF_MTAP(ifp, m); #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { struct vlan_snd_tag *vst; struct m_snd_tag *mst; MPASS(m->m_pkthdr.snd_tag->ifp == ifp); mst = m->m_pkthdr.snd_tag; vst = mst_to_vst(mst); if (vst->tag->ifp != p) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (EAGAIN); } m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag); m_snd_tag_rele(mst); } #endif /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } if (!ether_8021q_frame(&m, ifp, p, ifv->ifv_vid, ifv->ifv_pcp)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (0); } /* * Send it, precisely as ether_output() would have. */ error = (p->if_transmit)(p, m); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct ifvlan *ifv; struct ifnet *p; NET_EPOCH_ASSERT(); ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { m_freem(m); return (ENETDOWN); } p = PARENT(ifv); return p->if_output(ifp, m, dst, ro); } /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ static void vlan_qflush(struct ifnet *ifp __unused) { } static void vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk; struct ifvlan *ifv; struct m_tag *mtag; uint16_t vid, tag; NET_EPOCH_ASSERT(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { m_freem(m); return; } if (m->m_flags & M_VLANTAG) { /* * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ tag = m->m_pkthdr.ether_vtag; m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; /* * Packet is tagged in-band as specified by 802.1q. 
*/ switch (ifp->if_type) { case IFT_ETHER: if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); return; } evl = mtod(m, struct ether_vlan_header *); tag = ntohs(evl->evl_tag); /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. */ bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); break; default: #ifdef INVARIANTS panic("%s: %s has unsupported if_type %u", __func__, ifp->if_xname, ifp->if_type); #endif if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } } vid = EVL_VLANOFTAG(tag); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } if (vlan_mtag_pcp) { /* * While uncommon, it is possible that we will find a 802.1q * packet encapsulated inside another packet that also had an * 802.1q header. For example, ethernet tunneled over IPSEC * arriving over ethernet. In that case, we replace the * existing 802.1q PCP m_tag value. */ mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); if (mtag == NULL) { mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN, sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m_tag_prepend(m, mtag); } *(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag); } m->m_pkthdr.rcvif = ifv->ifv_ifp; if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1); /* Pass it back through the parent's input routine. */ (*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m); } static void vlan_lladdr_fn(void *arg, int pending __unused) { struct ifvlan *ifv; struct ifnet *ifp; ifv = (struct ifvlan *)arg; ifp = ifv->ifv_ifp; CURVNET_SET(ifp->if_vnet); /* The ifv_ifp already has the lladdr copied in. */ if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen); CURVNET_RESTORE(); } static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; /* * We can handle non-ethernet hardware types as long as * they handle the tagging and headers themselves. */ if (p->if_type != IFT_ETHER && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) return (EPROTONOSUPPORT); /* * Don't let the caller set up a VLAN VID with * anything except VLID bits. * VID numbers 0x0 and 0xFFF are reserved. */ if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK)) return (EINVAL); if (ifv->ifv_trunk) return (EBUSY); VLAN_XLOCK(); if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); vlan_inithash(trunk); TRUNK_LOCK_INIT(trunk); TRUNK_WLOCK(trunk); p->if_vlantrunk = trunk; trunk->parent = p; if_ref(trunk->parent); TRUNK_WUNLOCK(trunk); } else { trunk = p->if_vlantrunk; } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ ifv->ifv_pcp = 0; /* Default: best effort delivery. */ vlan_tag_recalculate(ifv); error = vlan_inshash(trunk, ifv); if (error) goto done; ifv->ifv_proto = ETHERTYPE_VLAN; ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; ifv->ifv_mintu = ETHERMIN; ifv->ifv_pflags = 0; ifv->ifv_capenable = -1; /* * If the parent supports the VLAN_MTU capability, * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames, * use it. 
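*/

/*
 * [Editorial sketch -- not part of this commit.] The in-band untagging
 * in vlan_input() above is a 12-byte slide: the destination and source
 * MAC addresses are copied forward over the 4-byte 802.1Q shim (the
 * kernel's bcopy() copes with the overlap; memmove() below does the
 * same), then m_adj() discards the now-dead first four bytes. Byte-level
 * sketch:
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define ETHER_HDR_LEN           14
#define ETHER_TYPE_LEN          2
#define ETHER_VLAN_ENCAP_LEN    4

int
main(void)
{
    /* dst, src, TPID 0x8100, TCI 0x0042 (vid 66), inner type 0x0800 */
    uint8_t frame[] = {
        1, 1, 1, 1, 1, 1,  2, 2, 2, 2, 2, 2,
        0x81, 0x00, 0x00, 0x42, 0x08, 0x00,
    };
    uint16_t tag = (uint16_t)(frame[14] << 8 | frame[15]);

    /* slide the addresses over the shim... */
    memmove(frame + ETHER_VLAN_ENCAP_LEN, frame,
        ETHER_HDR_LEN - ETHER_TYPE_LEN);
    /* ...and drop the first 4 bytes, as m_adj() does */
    uint8_t *m = frame + ETHER_VLAN_ENCAP_LEN;

    printf("vid %u, ethertype %02x%02x\n",
        tag & 0xfff, m[12], m[13]);     /* vid 66, ethertype 0800 */
    return (0);
}
#endif /* 0 */

/*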
*/ if (p->if_capenable & IFCAP_VLAN_MTU) { /* * No need to fudge the MTU since the parent can * handle extended frames. */ ifv->ifv_mtufudge = 0; } else { /* * Fudge the MTU by the encapsulation size. This * makes us incompatible with strictly compliant * 802.1Q implementations, but allows us to use * the feature with other NetBSD implementations, * which might still be useful. */ ifv->ifv_mtufudge = ifv->ifv_encaplen; } ifv->ifv_trunk = trunk; ifp = ifv->ifv_ifp; /* * Initialize fields from our parent. This duplicates some * work with ether_ifattach() but allows for non-ethernet * interfaces to also work. */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; ifp->if_input = p->if_input; ifp->if_resolvemulti = p->if_resolvemulti; ifp->if_addrlen = p->if_addrlen; ifp->if_broadcastaddr = p->if_broadcastaddr; ifp->if_pcp = ifv->ifv_pcp; /* * We wrap the parent's if_output using vlan_output to ensure that it * can't become stale. */ ifp->if_output = vlan_output; /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. */ #define VLAN_COPY_FLAGS (IFF_SIMPLEX) ifp->if_flags &= ~VLAN_COPY_FLAGS; ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS; #undef VLAN_COPY_FLAGS ifp->if_link_state = p->if_link_state; NET_EPOCH_ENTER(et); vlan_capabilities(ifv); NET_EPOCH_EXIT(et); /* * Set up our interface address to reflect the underlying * physical interface's. */ TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; /* * Do not schedule link address update if it was the same * as previous parent's. This helps avoid updating for each * associated llentry. */ if (memcmp(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen) != 0) { bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } /* We are ready for operation now. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); /* * Configure multicast addresses that may already be * joined on the vlan device. */ (void)vlan_setmulti(ifp); done: if (error == 0) EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_XUNLOCK(); return (error); } static void vlan_unconfig(struct ifnet *ifp) { VLAN_XLOCK(); vlan_unconfig_locked(ifp, 0); VLAN_XUNLOCK(); } static void vlan_unconfig_locked(struct ifnet *ifp, int departing) { struct ifvlantrunk *trunk; struct vlan_mc_entry *mc; struct ifvlan *ifv; struct ifnet *parent; int error; VLAN_XLOCK_ASSERT(); ifv = ifp->if_softc; trunk = ifv->ifv_trunk; parent = NULL; if (trunk != NULL) { parent = trunk->parent; /* * Since the interface is being unconfigured, we need to * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { /* * If the parent interface is being detached, * all its multicast addresses have already * been removed. Warn about errors if * if_delmulti() does fail, but don't abort as * all callers expect vlan destruction to * succeed. 
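*/

/*
 * [Editorial sketch -- not part of this commit.] The branch above is the
 * entire MTU policy of vlan_config(): if the parent advertises
 * IFCAP_VLAN_MTU it can carry full-sized frames plus the 4-byte shim, so
 * no adjustment is needed; otherwise the vlan MTU is "fudged" down by
 * ETHER_VLAN_ENCAP_LEN. The same policy as a pure function:
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>
#include <stdbool.h>

#define ETHER_VLAN_ENCAP_LEN 4

static int
vlan_mtu(int parent_mtu, bool parent_vlan_mtu)
{
    int fudge = parent_vlan_mtu ? 0 : ETHER_VLAN_ENCAP_LEN;

    return (parent_mtu - fudge);
}

int
main(void)
{
    printf("%d\n", vlan_mtu(1500, false));  /* 1496 */
    printf("%d\n", vlan_mtu(1500, true));   /* 1500 */
    return (0);
}
#endif /* 0 */

/*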
*/ if (!departing) { error = if_delmulti(parent, (struct sockaddr *)&mc->mc_addr); if (error) if_printf(ifp, "Failed to delete multicast address from parent: %d\n", error); } CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); - epoch_call(net_epoch_preempt, &mc->mc_epoch_ctx, vlan_mc_free); + NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx); } vlan_setflags(ifp, 0); /* clear special flags on parent */ vlan_remhash(trunk, ifv); ifv->ifv_trunk = NULL; /* * Check if we were the last. */ if (trunk->refcnt == 0) { parent->if_vlantrunk = NULL; NET_EPOCH_WAIT(); trunk_destroy(trunk); } } /* Disconnect from parent. */ if (ifv->ifv_pflags) if_printf(ifp, "%s: ifv_pflags unclean\n", __func__); ifp->if_mtu = ETHERMTU; ifp->if_link_state = LINK_STATE_UNKNOWN; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Only dispatch an event if vlan was * attached, otherwise there is nothing * to cleanup anyway. */ if (parent != NULL) EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid); } /* Handle a reference counted flag that should be set on the parent as well */ static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)) { struct ifvlan *ifv; int error; VLAN_SXLOCK_ASSERT(); ifv = ifp->if_softc; status = status ? (ifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded parent's status is different from what * we want it to be. If it is, flip it. We record parent's * status in ifv_pflags so that we won't clear parent's flag * we haven't set. In fact, we don't clear or set parent's * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual parent's flags. */ if (status != (ifv->ifv_pflags & flag)) { error = (*func)(PARENT(ifv), status); if (error) return (error); ifv->ifv_pflags &= ~flag; ifv->ifv_pflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the parent: * if "status" is true, update parent's flags respective to our if_flags; * if "status" is false, forcedly clear the flags set on parent. */ static int vlan_setflags(struct ifnet *ifp, int status) { int error, i; for (i = 0; vlan_pflags[i].flag; i++) { error = vlan_setflag(ifp, vlan_pflags[i].flag, status, vlan_pflags[i].func); if (error) return (error); } return (0); } /* Inform all vlans that their parent has changed link state */ static void vlan_link_state(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate; if_link_state_change(ifv->ifv_ifp, trunk->parent->if_link_state); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } static void vlan_capabilities(struct ifvlan *ifv) { struct ifnet *p; struct ifnet *ifp; struct ifnet_hw_tsomax hw_tsomax; int cap = 0, ena = 0, mena; u_long hwa = 0; NET_EPOCH_ASSERT(); VLAN_SXLOCK_ASSERT(); p = PARENT(ifv); ifp = ifv->ifv_ifp; /* Mask parent interface enabled capabilities disabled by user. */ mena = p->if_capenable & ifv->ifv_capenable; /* * If the parent interface can do checksum offloading * on VLANs, then propagate its hardware-assisted * checksumming flags. Also assert that checksum * offloading requires hardware VLAN tagging. 
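*/

/*
 * [Editorial sketch -- not part of this commit.] vlan_setflag() above
 * records in ifv_pflags what it believes it has set on the parent, and
 * only calls the propagation function when the desired state differs
 * from the record, so references on the parent's flags are neither
 * leaked nor double-dropped. Pure-logic sketch with invented names:
 */
#if 0	/* standalone userspace illustration */
#include <stdio.h>

static int recorded_pflags;         /* plays the role of ifv_pflags */

static int
setflag(int if_flags, int flag, int enable,
    int (*func)(int flag, int status))
{
    int status = enable ? (if_flags & flag) : 0;

    if (status != (recorded_pflags & flag)) {
        int error = func(flag, status);

        if (error)
            return (error);
        recorded_pflags = (recorded_pflags & ~flag) | status;
    }
    return (0);                     /* already in the desired state */
}

static int
parent_update(int flag, int status)
{
    printf("parent: flag %#x -> %s\n", flag, status ? "on" : "off");
    return (0);
}

int
main(void)
{
    setflag(0x100, 0x100, 1, parent_update);    /* turns on */
    setflag(0x100, 0x100, 1, parent_update);    /* no-op */
    setflag(0x100, 0x100, 0, parent_update);    /* turns off */
    return (0);
}
#endif /* 0 */

/*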
*/ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (p->if_capenable & IFCAP_VLAN_HWCSUM && p->if_capenable & IFCAP_VLAN_HWTAGGING) { ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (ena & IFCAP_TXCSUM) hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); if (ena & IFCAP_TXCSUM_IPV6) hwa |= p->if_hwassist & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6); } /* * If the parent interface can do TSO on VLANs then * propagate the hardware-assisted flag. TSO on VLANs * does not necessarily require hardware VLAN tagging. */ memset(&hw_tsomax, 0, sizeof(hw_tsomax)); if_hw_tsomax_common(p, &hw_tsomax); if_hw_tsomax_update(ifp, &hw_tsomax); if (p->if_capabilities & IFCAP_VLAN_HWTSO) cap |= p->if_capabilities & IFCAP_TSO; if (p->if_capenable & IFCAP_VLAN_HWTSO) { ena |= mena & IFCAP_TSO; if (ena & IFCAP_TSO) hwa |= p->if_hwassist & CSUM_TSO; } /* * If the parent interface can do LRO and checksum offloading on * VLANs, then guess it may do LRO on VLANs. False positives here * cost nothing, while a false negative may lead to some confusion. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & IFCAP_LRO; if (p->if_capenable & IFCAP_VLAN_HWCSUM) ena |= p->if_capenable & IFCAP_LRO; /* * If the parent interface can offload TCP connections over VLANs then * propagate its TOE capability to the VLAN interface. * * All TOE drivers in the tree today can deal with VLANs. If this * changes then IFCAP_VLAN_TOE should be promoted to a full capability * with its own bit. */ #define IFCAP_VLAN_TOE IFCAP_TOE if (p->if_capabilities & IFCAP_VLAN_TOE) cap |= p->if_capabilities & IFCAP_TOE; if (p->if_capenable & IFCAP_VLAN_TOE) { TOEDEV(ifp) = TOEDEV(p); ena |= mena & IFCAP_TOE; } /* * If the parent interface supports dynamic link state, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_LINKSTATE); ena |= (mena & IFCAP_LINKSTATE); #ifdef RATELIMIT /* * If the parent interface supports ratelimiting, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_TXRTLMT); ena |= (mena & IFCAP_TXRTLMT); #endif /* * If the parent interface supports unmapped mbufs, so does * the VLAN interface. Note that this should be fine even for * interfaces that don't support hardware tagging as headers * are prepended in normal mbufs to unmapped mbufs holding * payload data. */ cap |= (p->if_capabilities & IFCAP_NOMAP); ena |= (mena & IFCAP_NOMAP); /* * If the parent interface can offload encryption and segmentation * of TLS records over TCP, propagate its capability to the VLAN * interface. * * All TLS drivers in the tree today can deal with VLANs. If * this ever changes, then a new IFCAP_VLAN_TXTLS can be * defined.
*/ if (p->if_capabilities & IFCAP_TXTLS) cap |= p->if_capabilities & IFCAP_TXTLS; if (p->if_capenable & IFCAP_TXTLS) ena |= mena & IFCAP_TXTLS; ifp->if_capabilities = cap; ifp->if_capenable = ena; ifp->if_hwassist = hwa; } static void vlan_trunk_capabilities(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; VLAN_SLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_SUNLOCK(); return; } NET_EPOCH_ENTER(et); VLAN_FOREACH(ifv, trunk) vlan_capabilities(ifv); NET_EPOCH_EXIT(et); VLAN_SUNLOCK(); } static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifnet *p; struct ifreq *ifr; struct ifaddr *ifa; struct ifvlan *ifv; struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0; ifr = (struct ifreq *)data; ifa = (struct ifaddr *) data; ifv = ifp->if_softc; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); #endif break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ifp->if_addrlen); break; case SIOCGIFMEDIA: VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { p = PARENT(ifv); if_ref(p); error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data); if_rele(p); /* Limit the result to the parent's current config. */ if (error == 0) { struct ifmediareq *ifmr; ifmr = (struct ifmediareq *)data; if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) { ifmr->ifm_count = 1; error = copyout(&ifmr->ifm_current, ifmr->ifm_ulist, sizeof(int)); } } } else { error = EINVAL; } VLAN_SUNLOCK(); break; case SIOCSIFMEDIA: error = EINVAL; break; case SIOCSIFMTU: /* * Set the interface MTU. */ VLAN_SLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) { TRUNK_WLOCK(trunk); if (ifr->ifr_mtu > (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) || ifr->ifr_mtu < (ifv->ifv_mintu - ifv->ifv_mtufudge)) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; TRUNK_WUNLOCK(trunk); } else error = EINVAL; VLAN_SUNLOCK(); break; case SIOCSETVLAN: #ifdef VIMAGE /* * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN * interface to be delegated to a jail without allowing the * jail to change what underlying interface/VID it is * associated with. We are not entirely convinced that this * is the right way to accomplish that policy goal. */ if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr)); if (error) break; if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; } p = ifunit_ref(vlr.vlr_parent); if (p == NULL) { error = ENOENT; break; } error = vlan_config(ifv, p, vlr.vlr_tag); if_rele(p); break; case SIOCGETVLAN: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif bzero(&vlr, sizeof(vlr)); VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, sizeof(vlr.vlr_parent)); vlr.vlr_tag = ifv->ifv_vid; } VLAN_SUNLOCK(); error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr)); break; case SIOCSIFFLAGS: /* * We should propagate selected flags to the parent, * e.g., promiscuous mode. */ VLAN_XLOCK(); if (TRUNK(ifv) != NULL) error = vlan_setflags(ifp, 1); VLAN_XUNLOCK(); break; case SIOCADDMULTI: case SIOCDELMULTI: /* * If we don't have a parent, just remember the membership for * when we do. * * XXX We need the rmlock here to avoid sleeping while * holding in6_multi_mtx. 
*/ VLAN_XLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) error = vlan_setmulti(ifp); VLAN_XUNLOCK(); break; case SIOCGVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif ifr->ifr_vlan_pcp = ifv->ifv_pcp; break; case SIOCSVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = priv_check(curthread, PRIV_NET_SETVLANPCP); if (error) break; if (ifr->ifr_vlan_pcp > 7) { error = EINVAL; break; } ifv->ifv_pcp = ifr->ifr_vlan_pcp; ifp->if_pcp = ifv->ifv_pcp; vlan_tag_recalculate(ifv); /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); break; case SIOCSIFCAP: VLAN_SLOCK(); ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); if (trunk != NULL) { struct epoch_tracker et; NET_EPOCH_ENTER(et); vlan_capabilities(ifv); NET_EPOCH_EXIT(et); } VLAN_SUNLOCK(); break; default: error = EINVAL; break; } return (error); } #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct epoch_tracker et; struct vlan_snd_tag *vst; struct ifvlan *ifv; struct ifnet *parent; int error; NET_EPOCH_ENTER(et); ifv = ifp->if_softc; if (ifv->ifv_trunk != NULL) parent = PARENT(ifv); else parent = NULL; if (parent == NULL || parent->if_snd_tag_alloc == NULL) { NET_EPOCH_EXIT(et); return (EOPNOTSUPP); } if_ref(parent); NET_EPOCH_EXIT(et); vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT); if (vst == NULL) { if_rele(parent); return (ENOMEM); } error = parent->if_snd_tag_alloc(parent, params, &vst->tag); if_rele(parent); if (error) { free(vst, M_VLAN); return (error); } m_snd_tag_init(&vst->com, ifp); *ppmt = &vst->com; return (0); } static int vlan_snd_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params)); } static int vlan_snd_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag->ifp->if_snd_tag_query(vst->tag, params)); } static void vlan_snd_tag_free(struct m_snd_tag *mst) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); m_snd_tag_rele(vst->tag); free(vst, M_VLAN); } #endif Index: head/sys/netinet/in.c =================================================================== --- head/sys/netinet/in.c (revision 356754) +++ head/sys/netinet/in.c (revision 356755) @@ -1,1513 +1,1513 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); VNET_DEFINE_STATIC(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; u_long i = ntohl(in.s_addr); struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) return (1); } return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. 
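 *
 * The result, when non-NULL, is returned referenced; the caller owns
 * that reference and must drop it, e.g.:
 *
 *	it = in_localip_more(ia);
 *	if (it != NULL) {
 *		... use it ...
 *		ifa_free(&it->ia_ifa);
 *	}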
*/ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i) || IN_ZERONET(i) || IN_LOOPBACK(i)) return (0); return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. 
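 *
 * From userland this is the classic request path for, e.g.,
 * SIOCGIFADDR (the interface name here is hypothetical):
 *
 *	struct ifreq ifr;
 *
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(s, SIOCGIFADDR, &ifr);	// s: AF_INET datagram socket
 *
 * Two passes are made over if_addrhead below: the first wants an
 * exact match on the requested address, and only if that fails does
 * the second settle for the first AF_INET address visible to the
 * caller's prison.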
*/ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } NET_EPOCH_EXIT(et); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } NET_EPOCH_EXIT(et); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. 
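 *
 * Worked example of the classful fallback below: 10.1.2.3 is
 * class A, so the guessed mask is 255.0.0.0 (IN_CLASSA_NET);
 * 172.16.0.5 is class B, giving 255.255.0.0; 192.0.2.1 is class C,
 * giving 255.255.255.0. Callers should pass an explicit ifra_mask
 * rather than rely on this guess.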
*/ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } /* * Note: we don't need extra reference for ifa, since we called * with sx lock held, and ifaddr can not be deleted in concurrent * thread. 
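 *
 * For reference, the broadcast address assembled above is just the
 * prefix with all host bits set. For 192.0.2.1/24:
 *
 *	ia_subnet       = 192.0.2.0
 *	~ia_subnetmask  = 0.0.0.255
 *	ia_broadaddr    = 192.0.2.255
 *
 * The RFC 3021 /31 case is special-cased to 255.255.255.255, since a
 * two-address point-to-point prefix has no host bits to spare.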
*/ EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, ifa, IFADDR_EVENT_ADD); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, cmd == SIOCAIFADDR); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); if (ii->ii_allhosts) { (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. 
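 *
 * "Prefix" below means the masked network address. On a broadcast
 * interface the key compared against every other address is
 *
 *	prefix = ia_addr & ia_sockmask
 *
 * while loopback/point-to-point addresses (RTF_HOST) use the peer
 * address with an all-zero mask, so two entries collide only when
 * prefix, mask and FIB all agree.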
*/ int in_addprefix(struct in_ifaddr *target, int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; /* * XXXME: add fib-aware in_localip. * We definitely don't want to switch between * prefixes in different fibs. 
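 *
 * The intent of the block just below: if some other configured
 * address carries the same IP, hand the host loopback route over to
 * it via ifa_switch_loopback_route(); only when the target is the
 * last holder of the address is the loopback route deleted outright.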
*/ eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifaliasreq ifr; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* NET_EPOCH_ENTER(et); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. */ bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* NET_EPOCH_EXIT(et); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } int in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. 
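 *
 * e.g. on 192.0.2.1/24 both 192.0.2.255 (the configured broadcast)
 * and 192.0.2.0 (the historical host-0 form) are accepted here, but
 * on a /31 (IN_RFC3021_MASK) the host-0 form is a legitimate host
 * address and is deliberately not matched.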
*/ (ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; int found; NET_EPOCH_ASSERT(); if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } return (found); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); IN_MULTI_UNLOCK(); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { struct in_multi_head purgeinms; struct in_multi *inm; struct ifmultiaddr *ifma, *next; SLIST_INIT(&purgeinms); IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; inm_rele_locked(&purgeinms, inm); if (__predict_false(ifma_restart)) { ifma_restart = true; goto restart; } } IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); IN_MULTI_LIST_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by the datapath to indicate that * the entry was used. */ static void in_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; LLE_REQ_UNLOCK(lle); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); - epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked); + NET_EPOCH_CALL(in_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. 
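 *
 * Marking la_expire with the current time_uptime below makes the
 * fresh entry look already stale, so the first lookup on the
 * datapath refuses to trust the (not yet learned) lladdr and kicks
 * off resolution instead.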
*/ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); lltable_unlink_entry(llt, lle); } /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rt_addrinfo info; struct sockaddr_in rt_key, rt_mask; struct sockaddr rt_gateway; int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); bzero(&rt_key, sizeof(rt_key)); rt_key.sin_len = sizeof(rt_key); bzero(&rt_mask, sizeof(rt_mask)); rt_mask.sin_len = sizeof(rt_mask); bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sa_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); rt_flags = info.rti_flags; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt_flags & RTF_GATEWAY) { if (!(rt_flags & RTF_HOST) || !info.rti_ifp || info.rti_ifp->if_type != IFT_ETHER || (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { rib_free_info(&info); return (EINVAL); } } rib_free_info(&info); /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; const struct sockaddr_in *l3sin; mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. 
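 *
 * The loop below is a byte-wise masked compare: the target lies on
 * the route's network iff, for every byte i,
 *
 *	((rt_key[i] ^ l3addr[i]) & rt_mask[i]) == 0
 *
 * e.g. key 192.0.2.0 with mask 255.255.255.0 accepts 192.0.2.77 but
 * rejects 192.0.3.77 on the third byte.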
*/ if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); sa = (const char *)&rt_key; addr = (const char *)l3addr; l3sin = (const struct sockaddr_in *)l3addr; lim = addr + l3sin->sin_len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC char addrbuf[INET_ADDRSTRLEN]; log(LOG_INFO, "IPv4 address: \"%s\" " "is not on the network\n", inet_ntoa_r(l3sin->sin_addr, addrbuf)); #endif return (EINVAL); } } } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { - epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked); + NET_EPOCH_CALL(in_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. 
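 *
 * More precisely, the lock state of a non-NULL result follows the
 * request flags, and the caller must unlock to match, e.g.:
 *
 *	lle = in_lltable_lookup(llt, LLE_EXCLUSIVE, l3addr);
 *	if (lle != NULL) {
 *		... modify the entry ...
 *		LLE_WUNLOCK(lle);
 *	}
 *
 * LLE_UNLOCKED (the datapath fast path) returns the entry unlocked,
 * LLE_EXCLUSIVE returns it write-locked, and the default is a read
 * lock; asking for LLE_UNLOCKED together with LLE_EXCLUSIVE trips
 * the KASSERT below.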
*/ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. */ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 
0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; llt->llt_mark_used = in_lltable_mark_used; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 356754) +++ head/sys/netinet/in_pcb.c (revision 356755) @@ -1,3492 +1,3491 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #ifdef INET #include #endif #include #include #ifdef TCPHPTS #include #endif #include #include #ifdef INET6 #include #include #include #include #endif /* INET6 */ #endif #include #include #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. 
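 *
 * The defaults below implement a simple back-off: ports are chosen
 * at random until more than ipport_randomcps (10) TCP allocations
 * land within one second, at which point ipport_tick() sets
 * ipport_stoprandom and allocation stays sequential for
 * ipport_randomtime (45) seconds before randomization is retried.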
*/ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); VNET_DEFINE_STATIC(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #ifdef RATELIMIT counter_u64_t rate_limit_active; counter_u64_t rate_limit_alloc_fail; counter_u64_t rate_limit_set_ok; static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0, "IP Rate Limiting"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, &rate_limit_active, "Active rate limited connections"); 
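/*
 * The hunks in this file follow the same pattern as the rest of the
 * change: open-coded epoch_call(net_epoch_preempt, ctx, fn) becomes
 * NET_EPOCH_CALL(fn, ctx), note the swapped argument order. The
 * deferred-free idiom itself is untouched: unlink the object under
 * the lock, then defer the free until all epoch readers drain,
 *
 *	CK_LIST_REMOVE(grp, il_list);
 *	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
 *
 * with the callback recovering the object via __containerof() and
 * calling free(), as in_pcblbgroup_free_deferred() does later in
 * this file.
 */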
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, &rate_limit_alloc_fail, "Rate limited connection failures"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, &rate_limit_set_ok, "Rate limited setting succeeded"); #endif /* RATELIMIT */ #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, uint16_t port, const union in_dependaddr *addr, int size) { struct inpcblbgroup *grp; size_t bytes; bytes = __offsetof(struct inpcblbgroup, il_inp[size]); grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); if (!grp) return (NULL); grp->il_vflag = vflag; grp->il_lport = port; grp->il_dependladdr = *addr; grp->il_inpsiz = size; CK_LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } static void in_pcblbgroup_free_deferred(epoch_context_t ctx) { struct inpcblbgroup *grp; grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); free(grp, M_PCB); } static void in_pcblbgroup_free(struct inpcblbgroup *grp) { CK_LIST_REMOVE(grp, il_list); - epoch_call(net_epoch_preempt, &grp->il_epoch_ctx, - in_pcblbgroup_free_deferred); + NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); } static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { struct inpcblbgroup *grp; int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size); if (grp == NULL) return (NULL); KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, ("invalid new local group size %d and old local group count %d", grp->il_inpsiz, old_grp->il_inpcnt)); for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; in_pcblbgroup_free(old_grp); return (grp); } /* * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] * and shrink group if possible. */ static void in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, int i) { struct inpcblbgroup *grp, *new_grp; grp = *grpp; for (; i + 1 < grp->il_inpcnt; ++i) grp->il_inp[i] = grp->il_inp[i + 1]; grp->il_inpcnt--; if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && grp->il_inpcnt <= grp->il_inpsiz / 4) { /* Shrink this group. */ new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); if (new_grp != NULL) *grpp = new_grp; } } /* * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int in_pcbinslbgrouphash(struct inpcb *inp) { const static struct timeval interval = { 60, 0 }; static struct timeval lastprint; struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Don't allow jailed socket to join local group. */ if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred)) return (0); #ifdef INET6 /* * Don't allow IPv4 mapped INET6 wild socket. 
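 *
 * For reference, the sizing policy shared by the functions around
 * this one: a group starts at INPCBLBGROUP_SIZMIN (8) slots, doubles
 * when full up to INPCBLBGROUP_SIZMAX (256), and
 * in_pcblbgroup_reorder() halves it once occupancy falls to a
 * quarter of capacity, keeping a SO_REUSEPORT_LB port's group
 * compact without reallocating on every membership change.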
*/ if ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr == INADDR_ANY && INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { return (0); } #endif idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; CK_LIST_FOREACH(grp, hdr, il_list) { if (grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) break; } if (grp == NULL) { /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN); if (grp == NULL) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) printf("lb group port %d, limit reached\n", ntohs(grp->il_lport)); return (0); } /* Expand this local group. */ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (grp == NULL) return (ENOBUFS); } KASSERT(grp->il_inpcnt < grp->il_inpsiz, ("invalid local group size %d and count %d", grp->il_inpsiz, grp->il_inpcnt)); grp->il_inp[grp->il_inpcnt] = inp; grp->il_inpcnt++; return (0); } /* * Remove PCB from load balance group. */ static void in_pcbremlbgrouphash(struct inpcb *inp) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int i; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_inpcnt == 1) { /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { /* Pull up inpcbs, shrink group if possible. */ in_pcblbgroup_reorder(hdr, &grp, i); } return; } } } /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. */ static void inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) { porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; CK_LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. 
*/ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { NET_EPOCH_ASSERT(); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); #ifdef NUMA inp->inp_numa_domain = M_NODOM; #endif inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. 
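 *
 * The search below folds the [first, last] range into one wrapping
 * walk: *lastport is optionally randomized once, then
 *
 *	++*lastport;
 *	if (*lastport < first || *lastport > last)
 *		*lastport = first;
 *
 * runs for at most last - first + 1 candidates, so EADDRNOTAVAIL is
 * returned only once every port in the range has been probed.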
*/ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. */ int inp_so_options(const struct inpcb *inp) { int so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) so_options |= SO_REUSEPORT_LB; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. 
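 *
 * In other words: a new IP_BINDMULTI socket may share the binding
 * only if the existing pcb was also bound with IP_BINDMULTI and both
 * sockets were created under the same uid; a pcb without
 * INP_BINDMULTI set on ni always passes this particular check.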
*/ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here * so that we don't have to add to the (already messy) code below. */ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; /* * XXX: How to deal with SO_REUSEPORT_LB here? * Treat same as SO_REUSEPORT for now. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. 
*/ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) || (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an inpcb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || ((reuseport & tw->tw_so_options) == 0 && (reuseport_lb & tw->tw_so_options) == 0)) { return (EADDRINUSE); } } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0 && (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If we don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m, bool rehash) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { KASSERT(rehash == true, ("Rehashing required for unbound inps")); inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; if (rehash) { in_pcbrehash_mbuf(inp, m); } else { in_pcbinshash_mbuf(inp, m); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well.
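 *
 * A minimal caller sketch (illustrative only; "faddr" is assumed to
 * already hold the foreign address in network byte order):
 *
 *	struct in_addr laddr;
 *	int error;
 *
 *	error = in_pcbladdr(inp, &faddr, &laddr, inp->inp_cred);
 *	if (error == 0)
 *		(laddr now holds the selected source address)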
*/ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; struct epoch_tracker et; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If the route is known, our src addr is taken from the i/f, * else punt. * * Find out the route to the destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ NET_EPOCH_ENTER(et); if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); } if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails, do these three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address.
*/ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is present. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find one, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: NET_EPOCH_EXIT(et); if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface.
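 *
 * As a compact summary of the block below (illustrative, not
 * normative):
 *
 *	faddr == INADDR_ANY       -> first address on V_in_ifaddrhead
 *	faddr == INADDR_BROADCAST -> ia_broadaddr of that interface,
 *				     but only if IFF_BROADCAST is set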
*/ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(&in_ifa_tracker); if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&CK_STAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsible for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already be held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele().
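 *
 * Typical usage sketch (illustrative only):
 *
 *	in_pcbref(inp);
 *	INP_WUNLOCK(inp);
 *	(... acquire other locks; inp may be freed asynchronously ...)
 *	INP_WLOCK(inp);
 *	if (in_pcbrele_wlocked(inp))
 *		return;		(inpcb was freed; the lock is gone)
 *	(inp is valid and write-locked again; revalidate cached state)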
*/ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. We must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. We must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper.
*/ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } void in_pcblist_rele_rlocked(epoch_context_t ctx) { struct in_pcblist *il; struct inpcb *inp; struct inpcbinfo *pcbinfo; int i, n; il = __containerof(ctx, struct in_pcblist, il_epoch_ctx); pcbinfo = il->il_pcbinfo; n = il->il_count; INP_INFO_WLOCK(pcbinfo); for (i = 0; i < n; i++) { inp = il->il_inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); free(il, M_TEMP); } static void inpcbport_free(epoch_context_t ctx) { struct inpcbport *phd; phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); free(phd, M_PCB); } static void in_pcbfree_deferred(epoch_context_t ctx) { struct inpcb *inp; int released __unused; inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); INP_WLOCK(inp); CURVNET_SET(inp->inp_vnet); #ifdef INET struct ip_moptions *imo = inp->inp_moptions; inp->inp_moptions = NULL; #endif /* XXXRW: Do as much as possible here. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif #ifdef INET6 struct ip6_moptions *im6o = NULL; if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); im6o = inp->in6p_moptions; inp->in6p_moptions = NULL; } #endif if (inp->inp_options) (void)m_free(inp->inp_options); inp->inp_vflag = 0; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif released = in_pcbrele_wlocked(inp); MPASS(released); #ifdef INET6 ip6_freemoptions(im6o); #endif #ifdef INET inp_freemoptions(imo); #endif CURVNET_RESTORE(); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); KASSERT((inp->inp_flags2 & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return; } #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_LOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK(pcbinfo); in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); RO_INVALIDATE_CACHE(&inp->inp_route); /* mark as destruction in progress */ inp->inp_flags2 |= INP_FREED; INP_WUNLOCK(inp); - epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred); + NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? 
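 *
 * Life-cycle sketch (illustrative only; the exact sequencing is
 * protocol-specific):
 *
 *	in_pcbdrop(inp);	(unhashed; reservation released)
 *	(socket may stay open; lookups no longer return inp)
 *	in_pcbdetach(inp);	(socket <-> inpcb association broken)
 *	in_pcbfree(inp);	(deferred free via the net epoch)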
*/ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); #ifdef INVARIANTS if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) MPASS(inp->inp_refcount > 1); #endif /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); in_pcbremlbgrouphash(inp); CK_LIST_REMOVE(inp, inp_hash); CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); - epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); + NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct in_multi *inm; struct in_mfilter *imf; struct ip_moptions *imo; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * * XXX This can all be deferred to an epoch_call */ restart: IP_MFILTER_FOREACH(imf, &imo->imo_head) { if ((inm = imf->imf_inm) == NULL) continue; if (inm->inm_ifp != ifp) continue; ip_mfilter_remove(&imo->imo_head, imf); IN_MULTI_LOCK_ASSERT(); in_leavegroup_locked(inm, NULL); ip_mfilter_free(imf); goto restart; } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. 
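 *
 * Illustrative caller sketch (not part of this change; "pcbinfo",
 * "laddr", "lport" and "cred" are assumed to be in scope):
 *
 *	struct inpcb *t;
 *
 *	INP_HASH_WLOCK(pcbinfo);
 *	t = in_pcblookup_local(pcbinfo, laddr, lport,
 *	    INPLOOKUP_WILDCARD, cred);
 *	if (t != NULL)
 *		(the local address and port are already in use)
 *	INP_HASH_WUNLOCK(pcbinfo);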
*/ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, uint16_t fport, int lookupflags) { struct inpcb *local_wild; const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; INP_HASH_LOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; /* * Order of socket selection: * 1. non-wild. * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). 
* * NOTE: * - Load balanced group does not contain jailed sockets * - Load balanced group does not contain IPv4 mapped INET6 wild sockets */ local_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) continue; #endif if (grp->il_lport != lport) continue; idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % grp->il_inpcnt; if (grp->il_laddr.s_addr == laddr->s_addr) return (grp->il_inp[idx]); if (grp->il_laddr.s_addr == INADDR_ANY && (lookupflags & INPLOOKUP_WILDCARD) != 0) local_wild = grp->il_inp[idx]; } return (local_wild); } #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; bool locked; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. 
jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (NULL); } else if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look in lb group (for wildcard match). */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport, lookupflags); if (inp != NULL) return (inp); } /* * Then look for a wildcard match, if requested. 
*/ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_WUNLOCK(inp); inp = NULL; } } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_RUNLOCK(inp); inp = NULL; } } else panic("%s: locking bug", __func__); #ifdef INVARIANTS if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); } #endif } return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. 
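 *
 * For reference, a hypothetical TCP-input-style caller of the public
 * lookup API (illustrative only; "ip" and "th" are assumed to point
 * at the IP and TCP headers of a received segment):
 *
 *	inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport,
 *	    ip->ip_dst, th->th_dport,
 *	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp);
 *	if (inp != NULL) {
 *		(inp is returned read-locked)
 *		INP_RUNLOCK(inp);
 *	}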
*/ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; int so_options; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Add entry to load balance group. * Only do this if SO_REUSEPORT_LB is set. */ so_options = inp_so_options(inp); if (so_options & SO_REUSEPORT_LB) { int ret = in_pcbinslbgrouphash(inp); if (ret) { /* pcb lb group malloc fail (ret=ENOBUFS). */ return (ret); } } /* * Go through port list and look for a head for this lport. */ CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. 
*/ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); phd->phd_port = inp->inp_lport; CK_LIST_INIT(&phd->phd_pcblist); CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (m != NULL) { in_pcbgroup_update_mbuf(inp, m); } else { in_pcbgroup_update(inp); } #endif return (0); } int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, NULL)); } int in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m) { return (in_pcbinshash_internal(inp, m)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; CK_LIST_REMOVE(inp, inp_hash); CK_LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { NET_EPOCH_ASSERT(); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); /* XXX: Only do if SO_REUSEPORT_LB set? */ in_pcbremlbgrouphash(inp); CK_LIST_REMOVE(inp, inp_hash); CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); - epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); + NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } CK_LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. 
*/ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } /* * Create an external-format (``xinpcb'') structure using the information in * the kernel-format in_pcb structure pointed to by inp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. 
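 *
 * A hypothetical consumer sketch (illustrative only; "req" stands in
 * for a sysctl request):
 *
 *	struct xinpcb xi;
 *
 *	in_pcbtoxinpcb(inp, &xi);
 *	error = SYSCTL_OUT(req, &xi, sizeof(xi));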
*/ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { bzero(xi, sizeof(*xi)); xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? 
", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. */ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_modify(mst, ¶ms); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. 
*/ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_query(mst, &params); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) return (EOPNOTSUPP); error = ifp->if_snd_tag_query(mst, &params); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; int error; INP_WLOCK_ASSERT(inp); if (*st != NULL) return (EINVAL); if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag); #ifdef INET if (error == 0) { counter_u64_add(rate_limit_set_ok, 1); counter_u64_add(rate_limit_active, 1); } else counter_u64_add(rate_limit_alloc_fail, 1); #endif } return (error); } void in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst) { if (ifp == NULL) return; /* * If the device was detached while we still had reference(s) * on the ifp, we assume if_snd_tag_free() was replaced with * stubs. */ ifp->if_snd_tag_free(mst); /* release reference count on network interface */ if_rele(ifp); #ifdef INET counter_u64_add(rate_limit_active, -1); #endif } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; m_snd_tag_rele(mst); } int in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) { int error; /* * If the existing send tag is for the wrong interface due to * a route change, first drop the existing tag. Set the * CHANGED flag so that we will keep trying to allocate a new * tag if we fail to allocate one this time. */ if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { in_pcbdetach_txrtlmt(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; } /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL.
*/ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; return (error); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; int error; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. */ void in_pcboutput_eagain(struct inpcb *inp) { bool did_upgrade; if (inp == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #ifdef INET static void rl_init(void *st) { rate_limit_active = counter_u64_alloc(M_WAITOK); rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); rate_limit_set_ok = counter_u64_alloc(M_WAITOK); } SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); #endif #endif /* RATELIMIT */ Index: head/sys/netinet/ip_gre.c =================================================================== --- head/sys/netinet/ip_gre.c (revision 356754) +++ head/sys/netinet/ip_gre.c (revision 356755) @@ -1,595 +1,594 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 1998 The NetBSD Foundation, Inc. * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Heiko W.Rupp * * IPv6-over-GRE contributed by Gert Doering * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #define GRE_TTL 30 VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL; #define V_ip_gre_ttl VNET(ip_gre_ttl) SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_gre_ttl), 0, "Default TTL value for encapsulated packets"); struct in_gre_socket { struct gre_socket base; in_addr_t addr; }; VNET_DEFINE_STATIC(struct gre_sockets *, ipv4_sockets) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv4_hashtbl) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv4_srchashtbl) = NULL; #define V_ipv4_sockets VNET(ipv4_sockets) #define V_ipv4_hashtbl VNET(ipv4_hashtbl) #define V_ipv4_srchashtbl VNET(ipv4_srchashtbl) #define GRE_HASH(src, dst) (V_ipv4_hashtbl[\ in_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)]) #define GRE_SRCHASH(src) (V_ipv4_srchashtbl[\ fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) #define GRE_SOCKHASH(src) (V_ipv4_sockets[\ fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) #define GRE_HASH_SC(sc) GRE_HASH((sc)->gre_oip.ip_src.s_addr,\ (sc)->gre_oip.ip_dst.s_addr) static uint32_t in_gre_hashval(in_addr_t src, in_addr_t dst) { uint32_t ret; ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); return (fnv_32_buf(&dst, sizeof(dst), ret)); } static struct gre_socket* in_gre_lookup_socket(in_addr_t addr) { struct gre_socket *gs; struct in_gre_socket *s; CK_LIST_FOREACH(gs, &GRE_SOCKHASH(addr), chain) { s = __containerof(gs, struct in_gre_socket, base); if (s->addr == addr) break; } return (gs); } static int in_gre_checkdup(const struct gre_softc *sc, in_addr_t src, in_addr_t dst, uint32_t opts) { struct gre_list *head; struct gre_softc *tmp; struct gre_socket *gs; if (sc->gre_family == AF_INET && sc->gre_oip.ip_src.s_addr == src && sc->gre_oip.ip_dst.s_addr == dst && (sc->gre_options & GRE_UDPENCAP) == (opts & GRE_UDPENCAP)) return (EEXIST); if (opts & GRE_UDPENCAP) { gs = in_gre_lookup_socket(src); if (gs == NULL) return (0); head = &gs->list; } else head = &GRE_HASH(src, dst); CK_LIST_FOREACH(tmp, head, chain) { if (tmp == sc) continue; if (tmp->gre_oip.ip_src.s_addr == src && tmp->gre_oip.ip_dst.s_addr == dst) return (EADDRNOTAVAIL); } return (0); } static int 
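The GRE_HASH()/GRE_SRCHASH()/GRE_SOCKHASH() macros above all reduce an FNV-1 hash with a power-of-two mask. A self-contained demo using the standard FNV-1 32-bit constants (assumed to match the kernel's fnv_32_buf(); the GRE_HASH_SIZE value below is illustrative, not the kernel's):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define FNV1_32_INIT	2166136261U	/* standard FNV-1 offset basis */
#define FNV_32_PRIME	16777619U
#define GRE_HASH_SIZE	16		/* power of two, so & works as modulo */

static uint32_t
fnv_32_buf(const void *buf, size_t len, uint32_t hval)
{
	const uint8_t *s = buf;

	while (len-- != 0) {
		hval *= FNV_32_PRIME;	/* FNV-1: multiply, then XOR */
		hval ^= *s++;
	}
	return (hval);
}

int
main(void)
{
	uint32_t src = 0x0a000001, dst = 0x0a000002;	/* 10.0.0.1/10.0.0.2 */
	uint32_t h;

	/* in_gre_hashval(): chain src then dst through one hash state. */
	h = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT);
	h = fnv_32_buf(&dst, sizeof(dst), h);
	printf("bucket %u of %u\n", h & (GRE_HASH_SIZE - 1), GRE_HASH_SIZE);
	return (0);
}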
in_gre_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip *ip; struct gre_softc *sc; if (V_ipv4_hashtbl == NULL) return (0); NET_EPOCH_ASSERT(); ip = mtod(m, const struct ip *); CK_LIST_FOREACH(sc, &GRE_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { /* * This is an inbound packet, its ip_dst is source address * in softc. */ if (sc->gre_oip.ip_src.s_addr == ip->ip_dst.s_addr && sc->gre_oip.ip_dst.s_addr == ip->ip_src.s_addr) { if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) return (0); *arg = sc; return (ENCAP_DRV_LOOKUP); } } return (0); } /* * Check that ingress address belongs to local host. */ static void in_gre_set_running(struct gre_softc *sc) { if (in_localip(sc->gre_oip.ip_src)) GRE2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; else GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * ifaddr_event handler. * Clear IFF_DRV_RUNNING flag when ingress address disappears to prevent * source address spoofing. */ static void in_gre_srcaddr(void *arg __unused, const struct sockaddr *sa, int event __unused) { const struct sockaddr_in *sin; struct gre_softc *sc; /* Check that VNET is ready */ if (V_ipv4_hashtbl == NULL) return; NET_EPOCH_ASSERT(); sin = (const struct sockaddr_in *)sa; CK_LIST_FOREACH(sc, &GRE_SRCHASH(sin->sin_addr.s_addr), srchash) { if (sc->gre_oip.ip_src.s_addr != sin->sin_addr.s_addr) continue; in_gre_set_running(sc); } } static void in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct epoch_tracker et; struct gre_socket *gs; struct gre_softc *sc; in_addr_t dst; NET_EPOCH_ENTER(et); /* * udp_append() holds reference to inp, it is safe to check * inp_flags2 without INP_RLOCK(). * If socket was closed before we have entered NET_EPOCH section, * INP_FREED flag should be set. Otherwise it should be safe to * make access to ctx data, because gre_so will be freed by - * gre_sofree() via epoch_call(). + * gre_sofree() via NET_EPOCH_CALL(). */ if (__predict_false(inp->inp_flags2 & INP_FREED)) { NET_EPOCH_EXIT(et); m_freem(m); return; } gs = (struct gre_socket *)ctx; dst = ((const struct sockaddr_in *)sa)->sin_addr.s_addr; CK_LIST_FOREACH(sc, &gs->list, chain) { if (sc->gre_oip.ip_dst.s_addr == dst) break; } if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0) { gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); NET_EPOCH_EXIT(et); return; } m_freem(m); NET_EPOCH_EXIT(et); } static int in_gre_setup_socket(struct gre_softc *sc) { struct sockopt sopt; struct sockaddr_in sin; struct in_gre_socket *s; struct gre_socket *gs; in_addr_t addr; int error, value; /* * NOTE: we are protected with gre_ioctl_sx lock. * * First check that socket is already configured. * If so, check that source address was not changed. * If address is different, check that there are no other tunnels * and close socket. */ addr = sc->gre_oip.ip_src.s_addr; gs = sc->gre_so; if (gs != NULL) { s = __containerof(gs, struct in_gre_socket, base); if (s->addr != addr) { if (CK_LIST_EMPTY(&gs->list)) { CK_LIST_REMOVE(gs, chain); soclose(gs->so); - epoch_call(net_epoch_preempt, &gs->epoch_ctx, - gre_sofree); + NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx); } gs = sc->gre_so = NULL; } } if (gs == NULL) { /* * Check that socket for given address is already * configured.
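The conversion this revision makes, in pattern form: unlink the object so no new epoch-protected lookup can find it, then hand the actual free to the epoch subsystem. Condensed from in_gre_setup_socket() above (kernel context, not standalone code):

/* Unlink first: no new reader inside the net epoch can now find gs. */
CK_LIST_REMOVE(gs, chain);
soclose(gs->so);
/*
 * Defer the free.  Readers that entered NET_EPOCH_ENTER() before the
 * unlink may still dereference gs; gre_sofree() runs only after every
 * such reader has exited.  NET_EPOCH_CALL(cb, ctx) is the new spelling
 * of epoch_call(net_epoch_preempt, ctx, cb).
 */
NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx);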
*/ gs = in_gre_lookup_socket(addr); if (gs == NULL) { s = malloc(sizeof(*s), M_GRE, M_WAITOK | M_ZERO); s->addr = addr; gs = &s->base; error = socreate(sc->gre_family, &gs->so, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread); if (error != 0) { if_printf(GRE2IFP(sc), "cannot create socket: %d\n", error); free(s, M_GRE); return (error); } error = udp_set_kernel_tunneling(gs->so, in_gre_udp_input, NULL, gs); if (error != 0) { if_printf(GRE2IFP(sc), "cannot set UDP tunneling: %d\n", error); goto fail; } memset(&sopt, 0, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IP; sopt.sopt_name = IP_BINDANY; sopt.sopt_val = &value; sopt.sopt_valsize = sizeof(value); value = 1; error = sosetopt(gs->so, &sopt); if (error != 0) { if_printf(GRE2IFP(sc), "cannot set IP_BINDANY opt: %d\n", error); goto fail; } memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr.s_addr = addr; sin.sin_port = htons(GRE_UDPPORT); error = sobind(gs->so, (struct sockaddr *)&sin, curthread); if (error != 0) { if_printf(GRE2IFP(sc), "cannot bind socket: %d\n", error); goto fail; } /* Add socket to the chain */ CK_LIST_INSERT_HEAD(&GRE_SOCKHASH(addr), gs, chain); } } /* Add softc to the socket's list */ CK_LIST_INSERT_HEAD(&gs->list, sc, chain); sc->gre_so = gs; return (0); fail: soclose(gs->so); free(s, M_GRE); return (error); } static int in_gre_attach(struct gre_softc *sc) { struct grehdr *gh; int error; if (sc->gre_options & GRE_UDPENCAP) { sc->gre_csumflags = CSUM_UDP; sc->gre_hlen = sizeof(struct greudp); sc->gre_oip.ip_p = IPPROTO_UDP; gh = &sc->gre_udphdr->gi_gre; gre_update_udphdr(sc, &sc->gre_udp, in_pseudo(sc->gre_oip.ip_src.s_addr, sc->gre_oip.ip_dst.s_addr, 0)); } else { sc->gre_hlen = sizeof(struct greip); sc->gre_oip.ip_p = IPPROTO_GRE; gh = &sc->gre_iphdr->gi_gre; } sc->gre_oip.ip_v = IPVERSION; sc->gre_oip.ip_hl = sizeof(struct ip) >> 2; gre_update_hdr(sc, gh); /* * If we return error, this means that sc is not linked, * and caller should reset gre_family and free(sc->gre_hdr). */ if (sc->gre_options & GRE_UDPENCAP) { error = in_gre_setup_socket(sc); if (error != 0) return (error); } else CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); CK_LIST_INSERT_HEAD(&GRE_SRCHASH(sc->gre_oip.ip_src.s_addr), sc, srchash); /* Set IFF_DRV_RUNNING if interface is ready */ in_gre_set_running(sc); return (0); } int in_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value) { int error; /* NOTE: we are protected with gre_ioctl_sx lock */ MPASS(cmd == GRESKEY || cmd == GRESOPTS || cmd == GRESPORT); MPASS(sc->gre_family == AF_INET); /* * If we are going to change encapsulation protocol, do check * for duplicate tunnels. Return EEXIST here to do not confuse * user. 
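For reference, the order of operations in_gre_setup_socket() uses to stand up the UDP encapsulation socket, condensed from the code above (kernel context; error handling omitted):

socreate(sc->gre_family, &gs->so, SOCK_DGRAM, IPPROTO_UDP, cred, td);
udp_set_kernel_tunneling(gs->so, in_gre_udp_input, NULL, gs);
/* IP_BINDANY permits binding a source address not configured locally. */
sosetopt(gs->so, &sopt);			/* IPPROTO_IP / IP_BINDANY = 1 */
sobind(gs->so, (struct sockaddr *)&sin, td);	/* src addr, GRE_UDPPORT */
CK_LIST_INSERT_HEAD(&GRE_SOCKHASH(addr), gs, chain);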
*/ if (cmd == GRESOPTS && (sc->gre_options & GRE_UDPENCAP) != (value & GRE_UDPENCAP) && in_gre_checkdup(sc, sc->gre_oip.ip_src.s_addr, sc->gre_oip.ip_dst.s_addr, value) == EADDRNOTAVAIL) return (EEXIST); CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); switch (cmd) { case GRESKEY: sc->gre_key = value; break; case GRESOPTS: sc->gre_options = value; break; case GRESPORT: sc->gre_port = value; break; } error = in_gre_attach(sc); if (error != 0) { sc->gre_family = 0; free(sc->gre_hdr, M_GRE); } return (error); } int in_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *dst, *src; struct ip *ip; int error; /* NOTE: we are protected with gre_ioctl_sx lock */ error = EINVAL; switch (cmd) { case SIOCSIFPHYADDR: src = &((struct in_aliasreq *)data)->ifra_addr; dst = &((struct in_aliasreq *)data)->ifra_dstaddr; /* sanity checks */ if (src->sin_family != dst->sin_family || src->sin_family != AF_INET || src->sin_len != dst->sin_len || src->sin_len != sizeof(*src)) break; if (src->sin_addr.s_addr == INADDR_ANY || dst->sin_addr.s_addr == INADDR_ANY) { error = EADDRNOTAVAIL; break; } if (V_ipv4_hashtbl == NULL) { V_ipv4_hashtbl = gre_hashinit(); V_ipv4_srchashtbl = gre_hashinit(); V_ipv4_sockets = (struct gre_sockets *)gre_hashinit(); } error = in_gre_checkdup(sc, src->sin_addr.s_addr, dst->sin_addr.s_addr, sc->gre_options); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { /* Addresses are the same. Just return. */ error = 0; break; } ip = malloc(sizeof(struct greudp) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip->ip_src.s_addr = src->sin_addr.s_addr; ip->ip_dst.s_addr = dst->sin_addr.s_addr; if (sc->gre_family != 0) { /* Detach existing tunnel first */ CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); free(sc->gre_hdr, M_GRE); /* XXX: should we notify about link state change? */ } sc->gre_family = AF_INET; sc->gre_hdr = ip; sc->gre_oseq = 0; sc->gre_iseq = UINT32_MAX; error = in_gre_attach(sc); if (error != 0) { sc->gre_family = 0; free(sc->gre_hdr, M_GRE); } break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (sc->gre_family != AF_INET) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); src->sin_addr = (cmd == SIOCGIFPSRCADDR) ? sc->gre_oip.ip_src: sc->gre_oip.ip_dst; error = prison_if(curthread->td_ucred, (struct sockaddr *)src); if (error != 0) memset(src, 0, sizeof(*src)); break; } return (error); } int in_gre_output(struct mbuf *m, int af, int hlen) { struct greip *gi; gi = mtod(m, struct greip *); switch (af) { case AF_INET: /* * gre_transmit() has used M_PREPEND() that doesn't guarantee * m_data is contiguous more than hlen bytes. Use m_copydata() * here to avoid m_pullup(). 
*/ m_copydata(m, hlen + offsetof(struct ip, ip_tos), sizeof(u_char), &gi->gi_ip.ip_tos); m_copydata(m, hlen + offsetof(struct ip, ip_id), sizeof(u_short), (caddr_t)&gi->gi_ip.ip_id); break; #ifdef INET6 case AF_INET6: gi->gi_ip.ip_tos = 0; /* XXX */ ip_fillid(&gi->gi_ip); break; #endif } gi->gi_ip.ip_ttl = V_ip_gre_ttl; gi->gi_ip.ip_len = htons(m->m_pkthdr.len); return (ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL)); } static const struct srcaddrtab *ipv4_srcaddrtab = NULL; static const struct encaptab *ecookie = NULL; static const struct encap_config ipv4_encap_cfg = { .proto = IPPROTO_GRE, .min_length = sizeof(struct greip) + sizeof(struct ip), .exact_match = ENCAP_DRV_LOOKUP, .lookup = in_gre_lookup, .input = gre_input }; void in_gre_init(void) { if (!IS_DEFAULT_VNET(curvnet)) return; ipv4_srcaddrtab = ip_encap_register_srcaddr(in_gre_srcaddr, NULL, M_WAITOK); ecookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); } void in_gre_uninit(void) { if (IS_DEFAULT_VNET(curvnet)) { ip_encap_detach(ecookie); ip_encap_unregister_srcaddr(ipv4_srcaddrtab); } if (V_ipv4_hashtbl != NULL) { gre_hashdestroy(V_ipv4_hashtbl); V_ipv4_hashtbl = NULL; GRE_WAIT(); gre_hashdestroy(V_ipv4_srchashtbl); gre_hashdestroy((struct gre_list *)V_ipv4_sockets); } } Index: head/sys/netinet/tcp_ratelimit.c =================================================================== --- head/sys/netinet/tcp_ratelimit.c (revision 356754) +++ head/sys/netinet/tcp_ratelimit.c (revision 356755) @@ -1,1223 +1,1223 @@ /*- * * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2018-2019 * Netflix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /** * Author: Randall Stewart */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #ifdef INET6 #include #endif #include #ifndef USECS_IN_SECOND #define USECS_IN_SECOND 1000000 #endif /* * For the purposes of each send, what is the size * of an ethernet frame. 
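Recapping in_gre_output() above: M_PREPEND() only guarantees that the newly prepended hlen bytes are contiguous, so the inner header fields are fetched with m_copydata(), which walks the mbuf chain, rather than forcing a contiguous copy with m_pullup(). The shape of that access, with the reasoning as comments:

/* The inner IP header at offset hlen may live in a later mbuf. */
m_copydata(m, hlen + offsetof(struct ip, ip_tos), sizeof(u_char),
    &gi->gi_ip.ip_tos);			/* copy one byte across the chain */
m_copydata(m, hlen + offsetof(struct ip, ip_id), sizeof(u_short),
    (caddr_t)&gi->gi_ip.ip_id);		/* likewise for the 16-bit ID */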
*/ #ifndef ETHERNET_SEGMENT_SIZE #define ETHERNET_SEGMENT_SIZE 1500 #endif MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory"); #ifdef RATELIMIT #define COMMON_RATE 180500 uint64_t desired_rates[] = { 62500, /* 500Kbps */ 180500, /* 1.44Mbps */ 375000, /* 3Mbps */ 500000, /* 4Mbps */ 625000, /* 5Mbps */ 750000, /* 6Mbps */ 1000000, /* 8Mbps */ 1250000, /* 10Mbps */ 2500000, /* 20Mbps */ 3750000, /* 30Mbps */ 5000000, /* 40Meg */ 6250000, /* 50Mbps */ 12500000, /* 100Mbps */ 25000000, /* 200Mbps */ 50000000, /* 400Mbps */ 100000000, /* 800Mbps */ 12500, /* 100kbps */ 25000, /* 200kbps */ 875000, /* 7Mbps */ 1125000, /* 9Mbps */ 1875000, /* 15Mbps */ 3125000, /* 25Mbps */ 8125000, /* 65Mbps */ 10000000, /* 80Mbps */ 18750000, /* 150Mbps */ 20000000, /* 250Mbps */ 37500000, /* 350Mbps */ 62500000, /* 500Mbps */ 78125000, /* 625Mbps */ 125000000, /* 1Gbps */ }; #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) #define RS_ORDERED_COUNT 16 /* * Number that are in order * at the beginning of the table, * over this a sort is required. */ #define RS_NEXT_ORDER_GROUP 16 /* * The point in our table where * we fill in a second ordered * group (index wise means -1). */ #define ALL_HARDWARE_RATES 1004 /* * 1Meg - 1Gig in 1 Meg steps * plus 100, 200k and 500k and * 10Gig */ #define RS_ONE_MEGABIT_PERSEC 1000000 #define RS_ONE_GIGABIT_PERSEC 1000000000 #define RS_TEN_GIGABIT_PERSEC 10000000000 static struct head_tcp_rate_set int_rs; static struct mtx rs_mtx; uint32_t rs_number_alive; uint32_t rs_number_dead; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0, "TCP Ratelimit stats"); SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW, &rs_number_alive, 0, "Number of interfaces initialized for ratelimiting"); SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, &rs_number_dead, 0, "Number of interfaces departing from ratelimiting"); static void rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) { /* * Add sysctl entries for this interface.
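One unit pitfall in desired_rates[] above: entries are in bytes per second, while the comments name the nominal bit rate. A one-line sanity check, self-contained:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* First and last table entries: 500Kbps and 1Gbps nominal. */
	uint64_t entries[] = { 62500, 125000000 };

	for (int i = 0; i < 2; i++)
		printf("%llu bytes/sec = %llu bits/sec\n",
		    (unsigned long long)entries[i],
		    (unsigned long long)(entries[i] * 8));
	return (0);
}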
*/ if (rs->rs_flags & RS_INTF_NO_SUP) { SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "disable", CTLFLAG_RD, &rs->rs_disable, 0, "Disable this interface from new hdwr limiting?"); } else { SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "disable", CTLFLAG_RW, &rs->rs_disable, 0, "Disable this interface from new hdwr limiting?"); } SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "minseg", CTLFLAG_RW, &rs->rs_min_seg, 0, "What is the minimum we need to send on this interface?"); SYSCTL_ADD_U64(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "flow_limit", CTLFLAG_RW, &rs->rs_flow_limit, 0, "What is the limit for number of flows (0=unlimited)?"); SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "highest", CTLFLAG_RD, &rs->rs_highest_valid, 0, "Highest valid rate"); SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "lowest", CTLFLAG_RD, &rs->rs_lowest_valid, 0, "Lowest valid rate"); SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "flags", CTLFLAG_RD, &rs->rs_flags, 0, "What flags are on the entry?"); SYSCTL_ADD_S32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "numrates", CTLFLAG_RD, &rs->rs_rate_cnt, 0, "How many rates are there?"); SYSCTL_ADD_U64(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "flows_using", CTLFLAG_RD, &rs->rs_flows_using, 0, "How many flows are using this interface now?"); #ifdef DETAILED_RATELIMIT_SYSCTL if (rs->rs_rlt && rs->rs_rate_cnt > 0) { /* Let's display the rates */ int i; struct sysctl_oid *rl_rates; struct sysctl_oid *rl_rate_num; char rate_num[16]; rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_sysctl_root), OID_AUTO, "rate", CTLFLAG_RW, 0, "Ratelist"); for (i = 0; i < rs->rs_rate_cnt; i++) { sprintf(rate_num, "%d", i); rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rates), OID_AUTO, rate_num, CTLFLAG_RW, 0, "Individual Rate"); SYSCTL_ADD_U32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rate_num), OID_AUTO, "flags", CTLFLAG_RD, &rs->rs_rlt[i].flags, 0, "Flags on this rate"); SYSCTL_ADD_U32(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rate_num), OID_AUTO, "pacetime", CTLFLAG_RD, &rs->rs_rlt[i].time_between, 0, "Time hardware inserts between 1500 byte sends"); SYSCTL_ADD_U64(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rate_num), OID_AUTO, "rate", CTLFLAG_RD, &rs->rs_rlt[i].rate, 0, "Rate in bytes per second"); } } #endif } static void rs_destroy(epoch_context_t ctx) { struct tcp_rate_set *rs; bool do_free_rs; rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx); mtx_lock(&rs_mtx); rs->rs_flags &= ~RS_FUNERAL_SCHD; /* * In theory it's possible (but unlikely) * that while the delete was occurring * and we were applying the DEAD flag * someone slipped in and found the * interface in a lookup. While we * decided rs_flows_using was 0 and * were scheduling the epoch_call, the other * thread incremented rs_flows_using. This * is because users have a pointer and * we only use the rs_flows_using in an * atomic fashion, i.e. the other entities * are not protected. To assure this did * not occur, we check rs_flows_using here * before deleting. */ do_free_rs = (rs->rs_flows_using == 0); rs_number_dead--; mtx_unlock(&rs_mtx); if (do_free_rs) { sysctl_ctx_free(&rs->sysctl_ctx); free(rs->rs_rlt, M_TCPPACE); free(rs, M_TCPPACE); } } static void rs_defer_destroy(struct tcp_rate_set *rs) { mtx_assert(&rs_mtx, MA_OWNED); /* Check if already pending.
*/ if (rs->rs_flags & RS_FUNERAL_SCHD) return; rs_number_dead++; /* Set flag to only defer once. */ rs->rs_flags |= RS_FUNERAL_SCHD; - epoch_call(net_epoch_preempt, &rs->rs_epoch_ctx, rs_destroy); + NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx); } #ifdef INET extern counter_u64_t rate_limit_set_ok; extern counter_u64_t rate_limit_active; extern counter_u64_t rate_limit_alloc_fail; #endif static int rl_attach_txrtlmt(struct ifnet *ifp, uint32_t flowtype, int flowid, uint64_t cfg_rate, struct m_snd_tag **tag) { int error; union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = cfg_rate, .rate_limit.flags = M_NOWAIT, }; if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_alloc(ifp, &params, tag); #ifdef INET if (error == 0) { if_ref((*tag)->ifp); counter_u64_add(rate_limit_set_ok, 1); counter_u64_add(rate_limit_active, 1); } else counter_u64_add(rate_limit_alloc_fail, 1); #endif } return (error); } static void populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act) { /* * The internal table is "special", it * is two separate ordered tables that * must be merged. We get here when the * adapter specifies a number of rates that * covers both ranges in the table in some * form. */ int i, at_low, at_high; uint8_t low_disabled = 0, high_disabled = 0; for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) { rs->rs_rlt[i].flags = 0; rs->rs_rlt[i].time_between = 0; if ((low_disabled == 0) && (high_disabled || (rate_table_act[at_low] < rate_table_act[at_high]))) { rs->rs_rlt[i].rate = rate_table_act[at_low]; at_low++; if (at_low == RS_NEXT_ORDER_GROUP) low_disabled = 1; } else if (high_disabled == 0) { rs->rs_rlt[i].rate = rate_table_act[at_high]; at_high++; if (at_high == MAX_HDWR_RATES) high_disabled = 1; } } } static struct tcp_rate_set * rt_setup_new_rs(struct ifnet *ifp, int *error) { struct tcp_rate_set *rs; const uint64_t *rate_table_act; uint64_t lentim, res; size_t sz; uint32_t hash_type; int i; struct if_ratelimit_query_results rl; struct sysctl_oid *rl_sysctl_root; /* * We expect to enter with the * mutex locked. */ if (ifp->if_ratelimit_query == NULL) { /* * We can do nothing if we cannot * get a query back from the driver. */ return (NULL); } rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO); if (rs == NULL) { if (error) *error = ENOMEM; return (NULL); } rl.flags = RT_NOSUPPORT; ifp->if_ratelimit_query(ifp, &rl); if (rl.flags & RT_IS_UNUSABLE) { /* * The interface does not really support * the rate-limiting.
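populate_canned_table() above is a classic two-way merge of two ordered sub-ranges of a single array. A self-contained model of the same at_low/at_high walk (small illustrative values; N stands in for RS_NEXT_ORDER_GROUP):

#include <stdint.h>
#include <stdio.h>

#define N 3	/* boundary between the two sorted runs */

int
main(void)
{
	uint64_t tbl[] = { 10, 30, 50, 20, 40, 60 };	/* two sorted runs */
	int lo = 0, hi = N, len = 6;

	for (int i = 0; i < len; i++) {
		/* Take from the low run while it exists and is smaller. */
		if (lo < N && (hi >= len || tbl[lo] < tbl[hi]))
			printf("%llu ", (unsigned long long)tbl[lo++]);
		else
			printf("%llu ", (unsigned long long)tbl[hi++]);
	}
	printf("\n");	/* prints: 10 20 30 40 50 60 */
	return (0);
}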
*/ memset(rs, 0, sizeof(struct tcp_rate_set)); rs->rs_ifp = ifp; rs->rs_if_dunit = ifp->if_dunit; rs->rs_flags = RS_INTF_NO_SUP; rs->rs_disable = 1; rs_number_alive++; sysctl_ctx_init(&rs->sysctl_ctx); rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), OID_AUTO, rs->rs_ifp->if_xname, CTLFLAG_RW, 0, ""); rl_add_syctl_entries(rl_sysctl_root, rs); mtx_lock(&rs_mtx); CK_LIST_INSERT_HEAD(&int_rs, rs, next); mtx_unlock(&rs_mtx); return (rs); } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { memset(rs, 0, sizeof(struct tcp_rate_set)); rs->rs_ifp = ifp; rs->rs_if_dunit = ifp->if_dunit; rs->rs_flags = RS_IS_DEFF; rs_number_alive++; sysctl_ctx_init(&rs->sysctl_ctx); rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), OID_AUTO, rs->rs_ifp->if_xname, CTLFLAG_RW, 0, ""); rl_add_syctl_entries(rl_sysctl_root, rs); mtx_lock(&rs_mtx); CK_LIST_INSERT_HEAD(&int_rs, rs, next); mtx_unlock(&rs_mtx); return (rs); } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { /* Mellanox most likely */ rs->rs_ifp = ifp; rs->rs_if_dunit = ifp->if_dunit; rs->rs_rate_cnt = rl.number_of_rates; rs->rs_min_seg = rl.min_segment_burst; rs->rs_highest_valid = 0; rs->rs_flow_limit = rl.max_flows; rs->rs_flags = RS_IS_INTF | RS_NO_PRE; rs->rs_disable = 0; rate_table_act = rl.rate_table; } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) { /* Chelsio */ rs->rs_ifp = ifp; rs->rs_if_dunit = ifp->if_dunit; rs->rs_rate_cnt = rl.number_of_rates; rs->rs_min_seg = rl.min_segment_burst; rs->rs_disable = 0; rs->rs_flow_limit = rl.max_flows; rate_table_act = desired_rates; if ((rs->rs_rate_cnt > MAX_HDWR_RATES) && (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) { /* * Our desired table is not big * enough, do what we can. */ rs->rs_rate_cnt = MAX_HDWR_RATES; } if (rs->rs_rate_cnt <= RS_ORDERED_COUNT) rs->rs_flags = RS_IS_INTF; else rs->rs_flags = RS_IS_INTF | RS_INT_TBL; if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) rs->rs_rate_cnt = ALL_HARDWARE_RATES; } else { printf("Interface:%s unit:%d not one known to have rate-limits\n", ifp->if_dname, ifp->if_dunit); free(rs, M_TCPPACE); return (NULL); } sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt; rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT); if (rs->rs_rlt == NULL) { if (error) *error = ENOMEM; bail: free(rs, M_TCPPACE); return (NULL); } if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) { /* * The interface supports all * the rates we could possibly want. */ uint64_t rat; rs->rs_rlt[0].rate = 12500; /* 100k */ rs->rs_rlt[1].rate = 25000; /* 200k */ rs->rs_rlt[2].rate = 62500; /* 500k */ /* Note 125000 == 1Megabit * populate 1Meg - 1000Meg. */ for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) { rs->rs_rlt[i].rate = rat; rat += 125000; } rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000; } else if (rs->rs_flags & RS_INT_TBL) { /* We populate this in a special way */ populate_canned_table(rs, rate_table_act); } else { /* * Just copy in the rates from * the table, it is in order. */ for (i = 0; i < rs->rs_rate_cnt; i++) { rs->rs_rlt[i].rate = rate_table_act[i]; rs->rs_rlt[i].time_between = 0; rs->rs_rlt[i].flags = 0; } } for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) { /* * We go backwards through the list so that if we can't get * a rate and fail to init one, we have at least a chance of * getting the highest one. */ rs->rs_rlt[i].ptbl = rs; rs->rs_rlt[i].tag = NULL; /* * Calculate the time between.
*/ lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; res = lentim / rs->rs_rlt[i].rate; if (res > 0) rs->rs_rlt[i].time_between = res; else rs->rs_rlt[i].time_between = 1; if (rs->rs_flags & RS_NO_PRE) { rs->rs_rlt[i].flags = HDWRPACE_INITED; rs->rs_lowest_valid = i; } else { int err; #ifdef RSS hash_type = M_HASHTYPE_RSS_TCP_IPV4; #else hash_type = M_HASHTYPE_OPAQUE_HASH; #endif err = rl_attach_txrtlmt(ifp, hash_type, (i + 1), rs->rs_rlt[i].rate, &rs->rs_rlt[i].tag); if (err) { if (i == (rs->rs_rate_cnt - 1)) { /* * Huh - first rate and we can't get * it? */ free(rs->rs_rlt, M_TCPPACE); if (error) *error = err; goto bail; } else { if (error) *error = err; } break; } else { rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT; rs->rs_lowest_valid = i; } } } /* Did we get at least 1 rate? */ if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED) rs->rs_highest_valid = rs->rs_rate_cnt - 1; else { free(rs->rs_rlt, M_TCPPACE); goto bail; } rs_number_alive++; sysctl_ctx_init(&rs->sysctl_ctx); rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), OID_AUTO, rs->rs_ifp->if_xname, CTLFLAG_RW, 0, ""); rl_add_syctl_entries(rl_sysctl_root, rs); mtx_lock(&rs_mtx); CK_LIST_INSERT_HEAD(&int_rs, rs, next); mtx_unlock(&rs_mtx); return (rs); } static const struct tcp_hwrate_limit_table * tcp_int_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) { struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; uint64_t mbits_per_sec, ind_calc; int i; mbits_per_sec = (bytes_per_sec * 8); if (flags & RS_PACING_LT) { if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && (rs->rs_lowest_valid <= 2)){ /* * Smaller than 1Meg, only * 3 entries can match it. */ for(i = rs->rs_lowest_valid; i < 3; i++) { if (bytes_per_sec <= rs->rs_rlt[i].rate) { rte = &rs->rs_rlt[i]; break; } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { arte = &rs->rs_rlt[i]; } } goto done; } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ /* * Larger than 1G (the majority of * our table. */ if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC) rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; else arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; goto done; } /* * If we reach here its in our table (between 1Meg - 1000Meg), * just take the rounded down mbits per second, and add * 1Megabit to it, from this we can calculate * the index in the table. 
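The time_between computation above, as standalone arithmetic: the microsecond gap the hardware should insert between 1500-byte frames at a given byte rate, floored at one microsecond:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND, as in the kernel code. */
	uint64_t lentim = 1500ULL * 1000000ULL;
	uint64_t rates[] = { 125000, 12500000 };	/* 1Mbps, 100Mbps in B/s */

	for (int i = 0; i < 2; i++) {
		uint64_t res = lentim / rates[i];

		printf("rate %llu B/s -> %llu us between frames\n",
		    (unsigned long long)rates[i],
		    (unsigned long long)(res > 0 ? res : 1));
	}
	return (0);	/* prints 12000 and 120 */
}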
*/ ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec) ind_calc++; /* our table is offset by 3, we add 2 */ ind_calc += 2; if (ind_calc > (ALL_HARDWARE_RATES-1)) { /* This should not happen */ ind_calc = ALL_HARDWARE_RATES-1; } if ((ind_calc >= rs->rs_lowest_valid) && (ind_calc <= rs->rs_highest_valid)) rte = &rs->rs_rlt[ind_calc]; } else if (flags & RS_PACING_EXACT_MATCH) { if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && (rs->rs_lowest_valid <= 2)){ for(i = rs->rs_lowest_valid; i < 3; i++) { if (bytes_per_sec == rs->rs_rlt[i].rate) { rte = &rs->rs_rlt[i]; break; } } } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { /* > 1Gbps only one rate */ if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) { /* Its 10G wow */ rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; } } else { /* Ok it must be a exact meg (its between 1G and 1Meg) */ ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { /* its an exact Mbps */ ind_calc += 2; if (ind_calc > (ALL_HARDWARE_RATES-1)) { /* This should not happen */ ind_calc = ALL_HARDWARE_RATES-1; } if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) rte = &rs->rs_rlt[ind_calc]; } } } else { /* we want greater than the requested rate */ if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && (rs->rs_lowest_valid <= 2)){ arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */ for (i=2; i>=rs->rs_lowest_valid; i--) { if (bytes_per_sec < rs->rs_rlt[i].rate) { rte = &rs->rs_rlt[i]; break; } else if ((flags & RS_PACING_GEQ) && (bytes_per_sec == rs->rs_rlt[i].rate)) { rte = &rs->rs_rlt[i]; break; } else { arte = &rs->rs_rlt[i]; /* new alternate */ } } } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) { if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ /* Our top rate is larger than the request */ rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; } else if ((flags & RS_PACING_GEQ) && (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { /* It matches our top rate */ rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) { /* The top rate is an alternative */ arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; } } else { /* Its in our range 1Meg - 1Gig */ if (flags & RS_PACING_GEQ) { ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { if (ind_calc > (ALL_HARDWARE_RATES-1)) { /* This should not happen */ ind_calc = (ALL_HARDWARE_RATES-1); } rte = &rs->rs_rlt[ind_calc]; } goto done; } ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC; ind_calc += 2; if (ind_calc > (ALL_HARDWARE_RATES-1)) { /* This should not happen */ ind_calc = ALL_HARDWARE_RATES-1; } if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) rte = &rs->rs_rlt[ind_calc]; } } done: if ((rte == NULL) && (arte != NULL) && (flags & RS_PACING_SUB_OK)) { /* We can use the substitute */ rte = arte; } return (rte); } static const struct tcp_hwrate_limit_table * tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) { /** * Hunt the rate table with the restrictions in flags and find a * suitable rate if possible. * RS_PACING_EXACT_MATCH - look for an exact match to rate. * RS_PACING_GT - must be greater than. * RS_PACING_GEQ - must be greater than or equal. 
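The index arithmetic in tcp_int_find_suitable_rate() above relies on the layout built in rt_setup_new_rs(): slots 0-2 hold the sub-1Mbps rates, and slot i (for i >= 3) holds (i - 2) Mbps, i.e. 125000 * (i - 2) bytes/sec, which is why the code adds the offset 2. A worked example, self-contained:

#include <stdint.h>
#include <stdio.h>

#define ONE_MEG 1000000ULL	/* bits/sec, like RS_ONE_MEGABIT_PERSEC */

int
main(void)
{
	uint64_t bytes_per_sec = 625000;	/* requesting 5Mbps */
	uint64_t mbits = bytes_per_sec * 8;
	uint64_t ind = mbits / ONE_MEG;

	if (ind * ONE_MEG != mbits)
		ind++;		/* round non-exact rates up to the next Mbps */
	ind += 2;		/* skip the three sub-1Mbps slots */
	printf("5Mbps -> index %llu holding %llu bytes/sec\n",
	    (unsigned long long)ind,
	    (unsigned long long)(125000ULL * (ind - 2)));
	return (0);	/* index 7, rate 625000 */
}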
* RS_PACING_LT - must be less than. * RS_PACING_SUB_OK - If we don't meet criteria a * substitute is ok. */ int i, matched; struct tcp_hwrate_limit_table *rte = NULL; if ((rs->rs_flags & RS_INT_TBL) && (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) { /* * Here we don't want to paw thru * a big table, we have everything * from 1Meg - 1000Meg in 1Meg increments. * Use an alternate method to "lookup". */ return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags)); } if ((flags & RS_PACING_LT) || (flags & RS_PACING_EXACT_MATCH)) { /* * For exact and less than we go forward through the table. * This way when we find one larger we stop (exact was a * toss up). */ for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) { if ((flags & RS_PACING_EXACT_MATCH) && (bytes_per_sec == rs->rs_rlt[i].rate)) { rte = &rs->rs_rlt[i]; matched = 1; break; } else if ((flags & RS_PACING_LT) && (bytes_per_sec <= rs->rs_rlt[i].rate)) { rte = &rs->rs_rlt[i]; matched = 1; break; } if (bytes_per_sec > rs->rs_rlt[i].rate) break; } if ((matched == 0) && (flags & RS_PACING_LT) && (flags & RS_PACING_SUB_OK)) { /* Kick in a substitute (the lowest) */ rte = &rs->rs_rlt[rs->rs_lowest_valid]; } } else { /* * Here we go backward through the table so that we can find * the one greater in theory faster (but it's probably a * wash). */ for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) { if (rs->rs_rlt[i].rate > bytes_per_sec) { /* A possible candidate */ rte = &rs->rs_rlt[i]; } if ((flags & RS_PACING_GEQ) && (bytes_per_sec == rs->rs_rlt[i].rate)) { /* An exact match and we want equal */ matched = 1; rte = &rs->rs_rlt[i]; break; } else if (rte) { /* * Found one that is larger than but don't * stop, there may be a closer match. */ matched = 1; } if (rs->rs_rlt[i].rate < bytes_per_sec) { /* * We found a table entry that is smaller, * stop there will be none greater or equal. */ break; } } if ((matched == 0) && (flags & RS_PACING_SUB_OK)) { /* Kick in a substitute (the highest) */ rte = &rs->rs_rlt[rs->rs_highest_valid]; } } return (rte); } static struct ifnet * rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) { struct ifnet *tifp; struct m_snd_tag *tag; union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = 1, .rate_limit.max_rate = COMMON_RATE, .rate_limit.flags = M_NOWAIT, }; int err; #ifdef RSS params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ? M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4); #else params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH; #endif tag = NULL; if (ifp->if_snd_tag_alloc == NULL) { if (error) *error = ENODEV; return (NULL); } err = ifp->if_snd_tag_alloc(ifp, &params, &tag); if (err) { /* Failed to setup a tag? */ if (error) *error = err; return (NULL); } tifp = tag->ifp; tifp->if_snd_tag_free(tag); return (tifp); } static const struct tcp_hwrate_limit_table * rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec, uint32_t flags, int *error) { /* First let's find the interface if it exists */ const struct tcp_hwrate_limit_table *rte; struct tcp_rate_set *rs; struct epoch_tracker et; int err; NET_EPOCH_ENTER(et); use_real_interface: CK_LIST_FOREACH(rs, &int_rs, next) { /* * Note we don't look with the lock since we either see a * new entry or will get one when we try to add it.
*/ if (rs->rs_flags & RS_IS_DEAD) { /* The dead are not looked at */ continue; } if ((rs->rs_ifp == ifp) && (rs->rs_if_dunit == ifp->if_dunit)) { /* Ok we found it */ break; } } if ((rs == NULL) || (rs->rs_flags & RS_INTF_NO_SUP) || (rs->rs_flags & RS_IS_DEAD)) { /* * This means we got a packet *before* * the IF-UP was processed below, * while or after we already received an interface * departed event. In either case we really don't * want to do anything with pacing, in * the departing case the packet is not * going to go very far. The new case * might be arguable, but its impossible * to tell from the departing case. */ if (rs->rs_disable && error) *error = ENODEV; epoch_exit_preempt(net_epoch_preempt, &et); return (NULL); } if ((rs == NULL) || (rs->rs_disable != 0)) { if (rs->rs_disable && error) *error = ENOSPC; NET_EPOCH_EXIT(et); return (NULL); } if (rs->rs_flags & RS_IS_DEFF) { /* We need to find the real interface */ struct ifnet *tifp; tifp = rt_find_real_interface(ifp, inp, error); if (tifp == NULL) { if (rs->rs_disable && error) *error = ENOTSUP; NET_EPOCH_EXIT(et); return (NULL); } goto use_real_interface; } if (rs->rs_flow_limit && ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) { if (error) *error = ENOSPC; NET_EPOCH_EXIT(et); return (NULL); } rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); if (rte) { err = in_pcbattach_txrtlmt(inp, rs->rs_ifp, inp->inp_flowtype, inp->inp_flowid, rte->rate, &inp->inp_snd_tag); if (err) { /* Failed to attach */ if (error) *error = err; rte = NULL; } } if (rte) { /* * We use an atomic here for accounting so we don't have to * use locks when freeing. */ atomic_add_64(&rs->rs_flows_using, 1); } NET_EPOCH_EXIT(et); return (rte); } static void tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state) { int error; struct tcp_rate_set *rs; if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) || (link_state != LINK_STATE_UP)) { /* * We only care on an interface going up that is rate-limit * capable. 
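The rs_flows_using accounting above (atomic_add_64() on attach in rt_setup_rate(), atomic_fetchadd_64(..., -1) on release in tcp_rel_pacing_rate() further down) avoids taking rs_mtx on the hot path; only the releaser who drops the count to zero goes on to schedule destruction. A userland model of that handoff in C11 atomics:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t flows_using;
static bool is_dead;		/* set when the interface departs */

static void
release_flow(void)
{
	/* Fetch-then-subtract, like atomic_fetchadd_64(..., -1). */
	if (atomic_fetch_sub(&flows_using, 1) == 1 && is_dead)
		printf("last user gone: schedule deferred destroy\n");
}

int
main(void)
{
	atomic_fetch_add(&flows_using, 1);	/* a flow attaches */
	is_dead = true;				/* interface departs */
	release_flow();				/* pre == 1: we clean up */
	return (0);
}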
*/ return; } mtx_lock(&rs_mtx); CK_LIST_FOREACH(rs, &int_rs, next) { if ((rs->rs_ifp == ifp) && (rs->rs_if_dunit == ifp->if_dunit)) { /* We already have initialized this guy */ mtx_unlock(&rs_mtx); return; } } mtx_unlock(&rs_mtx); rt_setup_new_rs(ifp, &error); } static void tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) { struct tcp_rate_set *rs, *nrs; struct ifnet *tifp; int i; mtx_lock(&rs_mtx); CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { if ((rs->rs_ifp == ifp) && (rs->rs_if_dunit == ifp->if_dunit)) { CK_LIST_REMOVE(rs, next); rs_number_alive--; rs->rs_flags |= RS_IS_DEAD; for (i = 0; i < rs->rs_rate_cnt; i++) { if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { tifp = rs->rs_rlt[i].tag->ifp; in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag); rs->rs_rlt[i].tag = NULL; } rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; } if (rs->rs_flows_using == 0) rs_defer_destroy(rs); break; } } mtx_unlock(&rs_mtx); } static void tcp_rl_shutdown(void *arg __unused, int howto __unused) { struct tcp_rate_set *rs, *nrs; struct ifnet *tifp; int i; mtx_lock(&rs_mtx); CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { CK_LIST_REMOVE(rs, next); rs_number_alive--; rs->rs_flags |= RS_IS_DEAD; for (i = 0; i < rs->rs_rate_cnt; i++) { if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { tifp = rs->rs_rlt[i].tag->ifp; in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag); rs->rs_rlt[i].tag = NULL; } rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; } if (rs->rs_flows_using == 0) rs_defer_destroy(rs); } mtx_unlock(&rs_mtx); } const struct tcp_hwrate_limit_table * tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, uint64_t bytes_per_sec, int flags, int *error) { const struct tcp_hwrate_limit_table *rte; if (tp->t_inpcb->inp_snd_tag == NULL) { /* * We are setting up a rate for the first time. */ if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) { /* Not supported by the egress */ if (error) *error = ENODEV; return (NULL); } #ifdef KERN_TLS if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { /* * We currently can't do both TLS and hardware * pacing */ if (error) *error = EINVAL; return (NULL); } #endif rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error); } else { /* * We are modifying a rate, wrong interface? */ if (error) *error = EINVAL; rte = NULL; } return (rte); } const struct tcp_hwrate_limit_table * tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp, struct ifnet *ifp, uint64_t bytes_per_sec, int flags, int *error) { const struct tcp_hwrate_limit_table *nrte; const struct tcp_rate_set *rs; int is_indirect = 0; int err; if ((tp->t_inpcb->inp_snd_tag == NULL) || (crte == NULL)) { /* Wrong interface */ if (error) *error = EINVAL; return (NULL); } rs = crte->ptbl; if ((rs->rs_flags & RS_IS_DEAD) || (crte->flags & HDWRPACE_IFPDEPARTED)) { /* Release the rate, and try anew */ re_rate: tcp_rel_pacing_rate(crte, tp); nrte = tcp_set_pacing_rate(tp, ifp, bytes_per_sec, flags, error); return (nrte); } if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) is_indirect = 1; else is_indirect = 0; if ((is_indirect == 0) && ((ifp != rs->rs_ifp) || (ifp->if_dunit != rs->rs_if_dunit))) { /* * Something changed, the user is not pointing to the same * ifp? Maybe a route updated on this guy? */ goto re_rate; } else if (is_indirect) { /* * For indirect we have to dig in and find the real interface. */ struct ifnet *rifp; rifp = rt_find_real_interface(ifp, tp->t_inpcb, error); if (rifp == NULL) { /* Can't find it? 
*/ goto re_rate; } if ((rifp != rs->rs_ifp) || (ifp->if_dunit != rs->rs_if_dunit)) { goto re_rate; } } nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); if (nrte == crte) { /* No change */ if (error) *error = 0; return (crte); } if (nrte == NULL) { /* Release the old rate */ tcp_rel_pacing_rate(crte, tp); return (NULL); } /* Change rates to our new entry */ err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate); if (err) { if (error) *error = err; return (NULL); } if (error) *error = 0; return (nrte); } void tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp) { const struct tcp_rate_set *crs; struct tcp_rate_set *rs; uint64_t pre; crs = crte->ptbl; /* * Now we must break the const * in order to release our refcount. */ rs = __DECONST(struct tcp_rate_set *, crs); pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); if (pre == 1) { mtx_lock(&rs_mtx); /* * Is it dead? */ if (rs->rs_flags & RS_IS_DEAD) rs_defer_destroy(rs); mtx_unlock(&rs_mtx); } in_pcbdetach_txrtlmt(tp->t_inpcb); } static eventhandler_tag rl_ifnet_departs; static eventhandler_tag rl_ifnet_arrives; static eventhandler_tag rl_shutdown_start; static void tcp_rs_init(void *st __unused) { CK_LIST_INIT(&int_rs); rs_number_alive = 0; rs_number_dead = 0;; mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, tcp_rl_ifnet_departure, NULL, EVENTHANDLER_PRI_ANY); rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event, tcp_rl_ifnet_link, NULL, EVENTHANDLER_PRI_ANY); rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_rl_shutdown, NULL, SHUTDOWN_PRI_FIRST); printf("TCP_ratelimit: Is now initialized\n"); } SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL); #endif Index: head/sys/netinet6/in6.c =================================================================== --- head/sys/netinet6/in6.c (revision 356754) +++ head/sys/netinet6/in6.c (revision 356755) @@ -1,2557 +1,2557 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * struct in6_ifreq and struct ifreq must be type punnable for common members * of ifr_ifru to allow accessors to be shared. */ _Static_assert(offsetof(struct in6_ifreq, ifr_ifru) == offsetof(struct ifreq, ifr_ifru), "struct in6_ifreq and struct ifreq are not type punnable"); VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. 
*/ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct rt_addrinfo info; struct ifaddr *ifa; struct sockaddr_dl gateway; int fibnum; ifa = &ia->ia_ifa; /* * Prepare info data for the host route. * This code mimics one from ifa_maintain_loopback_route(). */ bzero(&info, sizeof(struct rt_addrinfo)); info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ifa->ifa_addr; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gateway; link_init_sdl(ifa->ifa_ifp, (struct sockaddr *)&gateway, ifa->ifa_ifp->if_type); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ia62ifa(ia)->ifa_ifp->if_fib; if (cmd == RTM_ADD) { rt_addrmsg(cmd, &ia->ia_ifa, fibnum); rt_routemsg_info(cmd, &info, fibnum); } else if (cmd == RTM_DELETE) { rt_routemsg_info(cmd, &info, fibnum); rt_addrmsg(cmd, &ia->ia_ifa, fibnum); } } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. 
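in6_mask2len() above counts the leading 0xff bytes of the mask and then the leading one-bits of the first partial byte. A self-contained model (omitting the strict trailing-bits validation the kernel performs when a limit pointer is given):

#include <stdint.h>
#include <stdio.h>

static int
mask2len(const uint8_t *mask, int size)
{
	int x = 0, y = 0;

	while (x < size && mask[x] == 0xff)	/* whole 0xff bytes */
		x++;
	if (x < size)
		for (y = 0; y < 8; y++)		/* bits of partial byte */
			if ((mask[x] & (0x80 >> y)) == 0)
				break;
	return (x * 8 + y);
}

int
main(void)
{
	/* /57 mask: seven 0xff bytes plus a single leading one-bit. */
	uint8_t m[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x80 };

	printf("prefix length %d\n", mask2len(m, 16));	/* prints 57 */
	return (0);
}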
*/ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. 
*/ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. 
*/ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) { if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, true); ifa_free(&ia->ia_ifa); } if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa, false); goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_addrcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_addrcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } nd6_prefix_rele(pr); /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. 
* The failure means address duplication was detected. */ } break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_addrcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr != NULL && pr->ndpr_addrcnt == 0) { ND6_WLOCK(); nd6_prefix_unlink(pr, NULL); ND6_WUNLOCK(); nd6_prefix_del(pr); } EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } static struct in6_multi_mship * in6_joingroup_legacy(struct ifnet *ifp, const struct in6_addr *mcaddr, int *errorp, int delay) { struct in6_multi_mship *imm; int error; imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT); if (imm == NULL) { *errorp = ENOBUFS; return (NULL); } delay = (delay * PR_FASTHZ) / hz; error = in6_joingroup(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); if (error) { *errorp = error; free(imm, M_IP6MADDR); return (NULL); } return (imm); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. 
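 * The group address is derived from the node's name by
 * in6_nigroup() below (RFC 4620 style Node Information queries);
 * joining it is best-effort, so a failure is only logged.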
*/ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. 
don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. 
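 * (ifa_alloc() below therefore gets M_NOWAIT; on failure it
 * returns NULL, which in6_update_ifa() maps to ENOBUFS.)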
 */
	ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT);
	if (ia == NULL)
		return (NULL);
	LIST_INIT(&ia->ia6_memberships);
	/* Initialize the address and masks, and put time stamp */
	ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
	ia->ia_addr.sin6_family = AF_INET6;
	ia->ia_addr.sin6_len = sizeof(ia->ia_addr);
	/* XXX: Can we assign .sin6_addr and skip the rest? */
	ia->ia_addr = ifra->ifra_addr;
	ia->ia6_createtime = time_uptime;
	if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
		/*
		 * Some functions expect that ifa_dstaddr is not
		 * NULL for p2p interfaces.
		 */
		ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
	} else {
		ia->ia_ifa.ifa_dstaddr = NULL;
	}

	/* set prefix mask if any */
	ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask;
	if (ifra->ifra_prefixmask.sin6_len != 0) {
		ia->ia_prefixmask.sin6_family = AF_INET6;
		ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len;
		ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr;
	}

	ia->ia_ifp = ifp;
	ifa_ref(&ia->ia_ifa);	/* if_addrhead */
	IF_ADDR_WLOCK(ifp);
	CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
	IF_ADDR_WUNLOCK(ifp);

	ifa_ref(&ia->ia_ifa);	/* in6_ifaddrhead */
	IN6_IFADDR_WLOCK();
	CK_STAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link);
	CK_LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr),
	    ia, ia6_hash);
	IN6_IFADDR_WUNLOCK();

	return (ia);
}

/*
 * Update/configure interface address parameters:
 *
 * 1) Update lifetime
 * 2) Update interface metric and flags
 * 3) Notify other subsystems
 */
static int
in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra,
    struct in6_ifaddr *ia, int hostIsNew, int flags)
{
	int error;

	/* update timestamp */
	ia->ia6_updatetime = time_uptime;

	/*
	 * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred
	 * to see if the address is deprecated or invalidated, but initialize
	 * these members for applications.
	 */
	ia->ia6_lifetime = ifra->ifra_lifetime;
	if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
		ia->ia6_lifetime.ia6t_expire =
		    time_uptime + ia->ia6_lifetime.ia6t_vltime;
	} else
		ia->ia6_lifetime.ia6t_expire = 0;
	if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
		ia->ia6_lifetime.ia6t_preferred =
		    time_uptime + ia->ia6_lifetime.ia6t_pltime;
	} else
		ia->ia6_lifetime.ia6t_preferred = 0;

	/*
	 * Backward compatibility - if IN6_IFF_DEPRECATED is set from
	 * userland, make the address deprecated.
	 */
	if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
		ia->ia6_lifetime.ia6t_pltime = 0;
		ia->ia6_lifetime.ia6t_preferred = time_uptime;
	}

	/*
	 * Configure address flags.
	 */
	ia->ia6_flags = ifra->ifra_flags;

	/*
	 * Make the address tentative before joining multicast addresses,
	 * so that corresponding MLD responses would not have a tentative
	 * source address.
	 */
	ia->ia6_flags &= ~IN6_IFF_DUPLICATED;	/* safety */

	/*
	 * DAD should be performed for a new address, and for addresses on
	 * an interface with ND6_IFF_IFDISABLED.
	 */
	if (in6if_do_dad(ifp) &&
	    (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)))
		ia->ia6_flags |= IN6_IFF_TENTATIVE;

	/* notify other subsystems */
	error = in6_notify_ifa(ifp, ia, ifra, hostIsNew);

	return (error);
}

/*
 * Do the link-level ifa job:
 * 1) Add an lle entry for the added address
 * 2) Notify routing socket users about the new address
 * 3) Join the appropriate multicast groups
 * 4) Start DAD if enabled
 */
static int
in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
    struct in6_ifaddr *ia, int flags)
{
	struct in6_multi *in6m_sol;
	int error = 0;

	/*
	 * Add the local address to the lltable, if necessary
	 * (e.g., on a p2p link).
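	 * A failure in nd6_add_ifa_lle() below fully unwinds the
	 * just-configured address via in6_purgeaddr().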
*/ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa, false); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); if (imm->i6mm_maddr != NULL) in6_leavegroup(imm->i6mm_maddr, NULL); free(imm, M_IP6MADDR); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6 ? RTF_HOST : 0)); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); CK_STAILQ_REMOVE(&V_in6_ifaddrhead, ia, in6_ifaddr, ia_link); CK_LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. 
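 * That is, ndpr_addrcnt is decremented below, and only when it
 * drops to zero is nd6_rem_ifa_lle() asked to also remove the
 * lle entries covered by the prefix.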
 */
	remove_lle = 0;
	if (ia->ia6_ndpr == NULL) {
		nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address "
		    "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia))));
	} else {
		ia->ia6_ndpr->ndpr_addrcnt--;
		/* Do not delete lles within the prefix if refcnt != 0 */
		if (ia->ia6_ndpr->ndpr_addrcnt == 0)
			remove_lle = 1;
		ia->ia6_ndpr = NULL;
	}

	nd6_rem_ifa_lle(ia, remove_lle);

	/*
	 * Also, if the address being removed is autoconf'ed, call
	 * pfxlist_onlink_check() since the release might affect the status of
	 * other (detached) addresses.
	 */
	if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) {
		pfxlist_onlink_check();
	}
	ifa_free(&ia->ia_ifa);	/* in6_ifaddrhead */
}

/*
 * Notify other subsystems about address change/arrival:
 * 1) Notify the device handler on the first IPv6 address assignment
 * 2) Handle routing table changes for p2p links and routes
 * 3) Handle routing table changes for the address host route
 */
static int
in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia,
    struct in6_aliasreq *ifra, int hostIsNew)
{
	int error = 0, plen, ifacount = 0;
	struct ifaddr *ifa;
	struct sockaddr_in6 *pdst;
	char ip6buf[INET6_ADDRSTRLEN];

	/*
	 * Give the interface a chance to initialize
	 * if this is its first address.
	 */
	if (hostIsNew != 0) {
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			if (ifa->ifa_addr->sa_family != AF_INET6)
				continue;
			ifacount++;
		}
		NET_EPOCH_EXIT(et);
	}

	if (ifacount <= 1 && ifp->if_ioctl) {
		error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
		if (error)
			goto done;
	}

	/*
	 * If a new destination address is specified, scrub the old one and
	 * install the new destination. Note that the interface must be
	 * p2p or loopback.
	 */
	pdst = &ifra->ifra_dstaddr;
	if (pdst->sin6_family == AF_INET6 &&
	    !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) {
		if ((ia->ia_flags & IFA_ROUTE) != 0 &&
		    (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) {
			nd6log((LOG_ERR, "in6_update_ifa_internal: failed to "
			    "remove a route to the old destination: %s\n",
			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
			/* proceed anyway... */
		} else
			ia->ia_flags &= ~IFA_ROUTE;
		ia->ia_dstaddr = *pdst;
	}

	/*
	 * If a new destination address is specified for a point-to-point
	 * interface, install a route to the destination as an interface
	 * direct route.
	 * XXX: the logic below rejects assigning multiple addresses on a p2p
	 * interface that share the same destination.
	 */
	plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */
	if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 &&
	    ia->ia_dstaddr.sin6_family == AF_INET6) {
		int rtflags = RTF_UP | RTF_HOST;
		/*
		 * Handle the case of ::1.
		 */
		if (ifp->if_flags & IFF_LOOPBACK)
			ia->ia_flags |= IFA_RTSELF;
		error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags);
		if (error)
			goto done;
		ia->ia_flags |= IFA_ROUTE;
	}

	/*
	 * Add a loopback route to self if one does not exist yet.
	 */
	if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) {
		error = ifa_add_loopback_route((struct ifaddr *)ia,
		    (struct sockaddr *)&ia->ia_addr);
		if (error == 0)
			ia->ia_flags |= IFA_RTSELF;
	}
done:
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Invoking IPv6 network device address event may sleep");

	ifa_ref(&ia->ia_ifa);
	EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa,
	    IFADDR_EVENT_ADD);
	ifa_free(&ia->ia_ifa);

	return (error);
}

/*
 * Find an IPv6 interface link-local address specific to an interface.
 * The ifaddr is returned referenced.
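 *
 * Usage sketch (hypothetical caller; the net epoch section and the
 * final ifa_free() are both required, per the NET_EPOCH_ASSERT()
 * and ifa_ref() below):
 *
 *	struct epoch_tracker et;
 *	struct in6_ifaddr *ia;
 *
 *	NET_EPOCH_ENTER(et);
 *	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY);
 *	NET_EPOCH_EXIT(et);
 *	if (ia != NULL) {
 *		... use ia->ia_addr ...
 *		ifa_free(&ia->ia_ifa);
 *	}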
*/ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } return ((struct in6_ifaddr *)ifa); } /* * find the interface address corresponding to a given IPv6 address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct epoch_tracker et; struct ifaddr *ifa; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct epoch_tracker et; struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. 
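 * E.g. the 16-bit group 0x0001 is printed as "1" and 0x0123 as
 * "123"; once a non-zero nibble has been seen the remaining
 * nibbles are always printed, so 0x1000 still comes out as "1000".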
*/ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; NET_EPOCH_ASSERT(); in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) return (1); } return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. 
if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ NET_EPOCH_ASSERT(); dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); return (besta); } CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); return dep[1]; } return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct epoch_tracker et; struct ifaddr *ifa; struct in6_ifaddr *ia; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } NET_EPOCH_EXIT(et); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (0); if ((ND_IFINFO(ifp)->flags & (ND6_IFF_IFDISABLED | ND6_IFF_NO_DAD)) != 0) return (0); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. 
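 * Loopback interfaces are excluded from the scan, and the global
 * is only updated when a positive maximum was found.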
*/ void in6_setmaxmtu(void) { struct epoch_tracker et; unsigned long maxmtu = 0; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } NET_EPOCH_EXIT(et); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_BRIDGE: /* bridge(4) only does Ethernet-like links */ case IFT_INFINIBAND: return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in6_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. 
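 * The actual free is deferred via NET_EPOCH_CALL() so that
 * lockless readers still inside a net epoch section can never
 * dereference a freed lle.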
*/ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); - epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in6_lltable_destroy_lle_unlocked); + NET_EPOCH_CALL(in6_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. */ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6; struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; int error; char ip6buf[INET6_ADDRSTRLEN]; int fibnum; NET_EPOCH_ASSERT(); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); sin6 = (const struct sockaddr_in6 *)l3addr; in6_splitscope(&sin6->sin6_addr, &dst, &scopeid); fibnum = V_rt_add_addr_allfibs ? RT_DEFAULT_FIB : ifp->if_fib; error = fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6); if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; } return 0; } /* * Called by the datapath to indicate that the entry was used. */ static void in6_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; /* * Set the hit time so the callback function * can determine the remaining time before * transiting to the DELAY state. 
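 * (r_skip_req and lle_hittime together let the ND6 timer see,
 * without taking the main lle lock per packet, whether and when
 * the datapath last used the entry.)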
*/ lle->lle_hittime = time_uptime; LLE_REQ_UNLOCK(lle); } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { - epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in6_lltable_destroy_lle_unlocked); + NET_EPOCH_CALL(in6_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. 
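 * That is, between finding the entry and being granted the LLE
 * lock, a writer may have unlinked it; the LLE_LINKED re-check
 * below drops the lock again and returns NULL in that case.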
*/ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } if (lle->la_expire != 0) ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; if (lle->ln_router != 0) ndpc.rtm.rtm_flags |= RTF_GATEWAY; ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked; /* Store state in rmx_weight value */ ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; llt->llt_mark_used = in6_lltable_mark_used; lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. 
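 * (Meaning: the pseudo-interface types listed below are not IPv6
 * capable, so in6_domifattach() gives them no AF_INET6 data and
 * returns NULL.)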
*/ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return ifp->if_mtu; return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ifp, ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: head/sys/netinet6/ip6_gre.c =================================================================== --- head/sys/netinet6/ip6_gre.c (revision 356754) +++ head/sys/netinet6/ip6_gre.c (revision 356755) @@ -1,595 +1,594 @@ /*- * Copyright (c) 2014, 2018 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(int, ip6_gre_hlim) = IPV6_DEFHLIM; #define V_ip6_gre_hlim VNET(ip6_gre_hlim) SYSCTL_DECL(_net_inet6_ip6); SYSCTL_INT(_net_inet6_ip6, OID_AUTO, grehlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gre_hlim), 0, "Default hop limit for encapsulated packets"); struct in6_gre_socket { struct gre_socket base; struct in6_addr addr; /* scope zone id is embedded */ }; VNET_DEFINE_STATIC(struct gre_sockets *, ipv6_sockets) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv6_hashtbl) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv6_srchashtbl) = NULL; #define V_ipv6_sockets VNET(ipv6_sockets) #define V_ipv6_hashtbl VNET(ipv6_hashtbl) #define V_ipv6_srchashtbl VNET(ipv6_srchashtbl) #define GRE_HASH(src, dst) (V_ipv6_hashtbl[\ in6_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)]) #define GRE_SRCHASH(src) (V_ipv6_srchashtbl[\ fnv_32_buf((src), sizeof(*src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) #define GRE_SOCKHASH(src) (V_ipv6_sockets[\ fnv_32_buf((src), sizeof(*src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) #define GRE_HASH_SC(sc) GRE_HASH(&(sc)->gre_oip6.ip6_src,\ &(sc)->gre_oip6.ip6_dst) static uint32_t in6_gre_hashval(const struct in6_addr *src, const struct in6_addr *dst) { uint32_t ret; ret = fnv_32_buf(src, sizeof(*src), FNV1_32_INIT); return (fnv_32_buf(dst, sizeof(*dst), ret)); } static struct gre_socket* in6_gre_lookup_socket(const struct in6_addr *addr) { struct gre_socket *gs; struct in6_gre_socket *s; CK_LIST_FOREACH(gs, &GRE_SOCKHASH(addr), chain) { s = __containerof(gs, struct in6_gre_socket, base); if (IN6_ARE_ADDR_EQUAL(&s->addr, addr)) break; } return (gs); } static int in6_gre_checkdup(const struct gre_softc *sc, const struct in6_addr *src, const struct in6_addr *dst, uint32_t opts) { struct gre_list *head; struct gre_softc *tmp; struct gre_socket *gs; if (sc->gre_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_src, src) && IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, dst) && (sc->gre_options & GRE_UDPENCAP) == (opts & GRE_UDPENCAP)) return (EEXIST); if (opts & GRE_UDPENCAP) { gs = in6_gre_lookup_socket(src); if (gs == NULL) return (0); head = &gs->list; } else head = &GRE_HASH(src, dst); CK_LIST_FOREACH(tmp, head, chain) { if (tmp == sc) continue; if (IN6_ARE_ADDR_EQUAL(&tmp->gre_oip6.ip6_src, src) && 
IN6_ARE_ADDR_EQUAL(&tmp->gre_oip6.ip6_dst, dst)) return (EADDRNOTAVAIL); } return (0); } static int in6_gre_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip6_hdr *ip6; struct gre_softc *sc; if (V_ipv6_hashtbl == NULL) return (0); NET_EPOCH_ASSERT(); ip6 = mtod(m, const struct ip6_hdr *); CK_LIST_FOREACH(sc, &GRE_HASH(&ip6->ip6_dst, &ip6->ip6_src), chain) { /* * This is an inbound packet, its ip6_dst is source address * in softc. */ if (IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_src, &ip6->ip6_dst) && IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, &ip6->ip6_src)) { if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) return (0); *arg = sc; return (ENCAP_DRV_LOOKUP); } } return (0); } /* * Check that ingress address belongs to local host. */ static void in6_gre_set_running(struct gre_softc *sc) { if (in6_localip(&sc->gre_oip6.ip6_src)) GRE2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; else GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * ifaddr_event handler. * Clear IFF_DRV_RUNNING flag when ingress address disappears to prevent * source address spoofing. */ static void in6_gre_srcaddr(void *arg __unused, const struct sockaddr *sa, int event __unused) { const struct sockaddr_in6 *sin; struct gre_softc *sc; /* Check that VNET is ready */ if (V_ipv6_hashtbl == NULL) return; NET_EPOCH_ASSERT(); sin = (const struct sockaddr_in6 *)sa; CK_LIST_FOREACH(sc, &GRE_SRCHASH(&sin->sin6_addr), srchash) { if (IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_src, &sin->sin6_addr) == 0) continue; in6_gre_set_running(sc); } } static void in6_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct epoch_tracker et; struct gre_socket *gs; struct gre_softc *sc; struct sockaddr_in6 dst; NET_EPOCH_ENTER(et); /* * udp_append() holds reference to inp, it is safe to check * inp_flags2 without INP_RLOCK(). * If socket was closed before we have entered NET_EPOCH section, * INP_FREED flag should be set. Otherwise it should be safe to * make access to ctx data, because gre_so will be freed by - * gre_sofree() via epoch_call(). + * gre_sofree() via NET_EPOCH_CALL(). */ if (__predict_false(inp->inp_flags2 & INP_FREED)) { NET_EPOCH_EXIT(et); m_freem(m); return; } gs = (struct gre_socket *)ctx; dst = *(const struct sockaddr_in6 *)sa; if (sa6_embedscope(&dst, 0)) { NET_EPOCH_EXIT(et); m_freem(m); return; } CK_LIST_FOREACH(sc, &gs->list, chain) { if (IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, &dst.sin6_addr)) break; } if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){ gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); NET_EPOCH_EXIT(et); return; } m_freem(m); NET_EPOCH_EXIT(et); } static int in6_gre_setup_socket(struct gre_softc *sc) { struct sockopt sopt; struct sockaddr_in6 sin6; struct in6_gre_socket *s; struct gre_socket *gs; int error, value; /* * NOTE: we are protected with gre_ioctl_sx lock. * * First check that socket is already configured. * If so, check that source addres was not changed. * If address is different, check that there are no other tunnels * and close socket. */ gs = sc->gre_so; if (gs != NULL) { s = __containerof(gs, struct in6_gre_socket, base); if (!IN6_ARE_ADDR_EQUAL(&s->addr, &sc->gre_oip6.ip6_src)) { if (CK_LIST_EMPTY(&gs->list)) { CK_LIST_REMOVE(gs, chain); soclose(gs->so); - epoch_call(net_epoch_preempt, &gs->epoch_ctx, - gre_sofree); + NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx); } gs = sc->gre_so = NULL; } } if (gs == NULL) { /* * Check that socket for given address is already * configured. 
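 * UDP-encapsulated tunnels sharing one source address share a
 * single bound socket; if a matching socket already exists, the
 * softc is simply linked into that socket's list further below.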
*/ gs = in6_gre_lookup_socket(&sc->gre_oip6.ip6_src); if (gs == NULL) { s = malloc(sizeof(*s), M_GRE, M_WAITOK | M_ZERO); s->addr = sc->gre_oip6.ip6_src; gs = &s->base; error = socreate(sc->gre_family, &gs->so, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread); if (error != 0) { if_printf(GRE2IFP(sc), "cannot create socket: %d\n", error); free(s, M_GRE); return (error); } error = udp_set_kernel_tunneling(gs->so, in6_gre_udp_input, NULL, gs); if (error != 0) { if_printf(GRE2IFP(sc), "cannot set UDP tunneling: %d\n", error); goto fail; } memset(&sopt, 0, sizeof(sopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_IPV6; sopt.sopt_name = IPV6_BINDANY; sopt.sopt_val = &value; sopt.sopt_valsize = sizeof(value); value = 1; error = sosetopt(gs->so, &sopt); if (error != 0) { if_printf(GRE2IFP(sc), "cannot set IPV6_BINDANY opt: %d\n", error); goto fail; } memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->gre_oip6.ip6_src; sin6.sin6_port = htons(GRE_UDPPORT); error = sa6_recoverscope(&sin6); if (error != 0) { if_printf(GRE2IFP(sc), "cannot determine scope zone id: %d\n", error); goto fail; } error = sobind(gs->so, (struct sockaddr *)&sin6, curthread); if (error != 0) { if_printf(GRE2IFP(sc), "cannot bind socket: %d\n", error); goto fail; } /* Add socket to the chain */ CK_LIST_INSERT_HEAD( &GRE_SOCKHASH(&sc->gre_oip6.ip6_src), gs, chain); } } /* Add softc to the socket's list */ CK_LIST_INSERT_HEAD(&gs->list, sc, chain); sc->gre_so = gs; return (0); fail: soclose(gs->so); free(s, M_GRE); return (error); } static int in6_gre_attach(struct gre_softc *sc) { struct grehdr *gh; int error; if (sc->gre_options & GRE_UDPENCAP) { sc->gre_csumflags = CSUM_UDP_IPV6; sc->gre_hlen = sizeof(struct greudp6); sc->gre_oip6.ip6_nxt = IPPROTO_UDP; gh = &sc->gre_udp6hdr->gi6_gre; gre_update_udphdr(sc, &sc->gre_udp6, in6_cksum_pseudo(&sc->gre_oip6, 0, 0, 0)); } else { sc->gre_hlen = sizeof(struct greip6); sc->gre_oip6.ip6_nxt = IPPROTO_GRE; gh = &sc->gre_ip6hdr->gi6_gre; } sc->gre_oip6.ip6_vfc = IPV6_VERSION; gre_update_hdr(sc, gh); /* * If we return error, this means that sc is not linked, * and caller should reset gre_family and free(sc->gre_hdr). */ if (sc->gre_options & GRE_UDPENCAP) { error = in6_gre_setup_socket(sc); if (error != 0) return (error); } else CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); CK_LIST_INSERT_HEAD(&GRE_SRCHASH(&sc->gre_oip6.ip6_src), sc, srchash); /* Set IFF_DRV_RUNNING if interface is ready */ in6_gre_set_running(sc); return (0); } int in6_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value) { int error; /* NOTE: we are protected with gre_ioctl_sx lock */ MPASS(cmd == GRESKEY || cmd == GRESOPTS || cmd == GRESPORT); MPASS(sc->gre_family == AF_INET6); /* * If we are going to change encapsulation protocol, do check * for duplicate tunnels. Return EEXIST here to do not confuse * user. 
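 * (in6_gre_checkdup() itself reports the clash as EADDRNOTAVAIL;
 * it is remapped to EEXIST below because, from the user's point
 * of view, toggling GRE_UDPENCAP would make this tunnel a
 * duplicate of an existing one.)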
*/ if (cmd == GRESOPTS && (sc->gre_options & GRE_UDPENCAP) != (value & GRE_UDPENCAP) && in6_gre_checkdup(sc, &sc->gre_oip6.ip6_src, &sc->gre_oip6.ip6_dst, value) == EADDRNOTAVAIL) return (EEXIST); CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); switch (cmd) { case GRESKEY: sc->gre_key = value; break; case GRESOPTS: sc->gre_options = value; break; case GRESPORT: sc->gre_port = value; break; } error = in6_gre_attach(sc); if (error != 0) { sc->gre_family = 0; free(sc->gre_hdr, M_GRE); } return (error); } int in6_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct sockaddr_in6 *dst, *src; struct ip6_hdr *ip6; int error; /* NOTE: we are protected with gre_ioctl_sx lock */ error = EINVAL; switch (cmd) { case SIOCSIFPHYADDR_IN6: src = &((struct in6_aliasreq *)data)->ifra_addr; dst = &((struct in6_aliasreq *)data)->ifra_dstaddr; /* sanity checks */ if (src->sin6_family != dst->sin6_family || src->sin6_family != AF_INET6 || src->sin6_len != dst->sin6_len || src->sin6_len != sizeof(*src)) break; if (IN6_IS_ADDR_UNSPECIFIED(&src->sin6_addr) || IN6_IS_ADDR_UNSPECIFIED(&dst->sin6_addr)) { error = EADDRNOTAVAIL; break; } /* * Check validity of the scope zone ID of the * addresses, and convert it into the kernel * internal form if necessary. */ if ((error = sa6_embedscope(src, 0)) != 0 || (error = sa6_embedscope(dst, 0)) != 0) break; if (V_ipv6_hashtbl == NULL) { V_ipv6_hashtbl = gre_hashinit(); V_ipv6_srchashtbl = gre_hashinit(); V_ipv6_sockets = (struct gre_sockets *)gre_hashinit(); } error = in6_gre_checkdup(sc, &src->sin6_addr, &dst->sin6_addr, sc->gre_options); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { /* Addresses are the same. Just return. */ error = 0; break; } ip6 = malloc(sizeof(struct greudp6) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip6->ip6_src = src->sin6_addr; ip6->ip6_dst = dst->sin6_addr; if (sc->gre_family != 0) { /* Detach existing tunnel first */ CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); free(sc->gre_hdr, M_GRE); /* XXX: should we notify about link state change? */ } sc->gre_family = AF_INET6; sc->gre_hdr = ip6; sc->gre_oseq = 0; sc->gre_iseq = UINT32_MAX; error = in6_gre_attach(sc); if (error != 0) { sc->gre_family = 0; free(sc->gre_hdr, M_GRE); } break; case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: if (sc->gre_family != AF_INET6) { error = EADDRNOTAVAIL; break; } src = (struct sockaddr_in6 *)&ifr->ifr_addr; memset(src, 0, sizeof(*src)); src->sin6_family = AF_INET6; src->sin6_len = sizeof(*src); src->sin6_addr = (cmd == SIOCGIFPSRCADDR_IN6) ? 
sc->gre_oip6.ip6_src: sc->gre_oip6.ip6_dst; error = prison_if(curthread->td_ucred, (struct sockaddr *)src); if (error == 0) error = sa6_recoverscope(src); if (error != 0) memset(src, 0, sizeof(*src)); break; } return (error); } int in6_gre_output(struct mbuf *m, int af __unused, int hlen __unused, uint32_t flowid) { struct greip6 *gi6; gi6 = mtod(m, struct greip6 *); gi6->gi6_ip6.ip6_hlim = V_ip6_gre_hlim; gi6->gi6_ip6.ip6_flow |= flowid & IPV6_FLOWLABEL_MASK; return (ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL, NULL)); } static const struct srcaddrtab *ipv6_srcaddrtab = NULL; static const struct encaptab *ecookie = NULL; static const struct encap_config ipv6_encap_cfg = { .proto = IPPROTO_GRE, .min_length = sizeof(struct greip6) + #ifdef INET sizeof(struct ip), #else sizeof(struct ip6_hdr), #endif .exact_match = ENCAP_DRV_LOOKUP, .lookup = in6_gre_lookup, .input = gre_input }; void in6_gre_init(void) { if (!IS_DEFAULT_VNET(curvnet)) return; ipv6_srcaddrtab = ip6_encap_register_srcaddr(in6_gre_srcaddr, NULL, M_WAITOK); ecookie = ip6_encap_attach(&ipv6_encap_cfg, NULL, M_WAITOK); } void in6_gre_uninit(void) { if (IS_DEFAULT_VNET(curvnet)) { ip6_encap_detach(ecookie); ip6_encap_unregister_srcaddr(ipv6_srcaddrtab); } if (V_ipv6_hashtbl != NULL) { gre_hashdestroy(V_ipv6_hashtbl); V_ipv6_hashtbl = NULL; GRE_WAIT(); gre_hashdestroy(V_ipv6_srchashtbl); gre_hashdestroy((struct gre_list *)V_ipv6_sockets); } } Index: head/sys/netpfil/ipfw/nat64/nat64lsn.c =================================================================== --- head/sys/netpfil/ipfw/nat64/nat64lsn.c (revision 356754) +++ head/sys/netpfil/ipfw/nat64/nat64lsn.c (revision 356755) @@ -1,1753 +1,1753 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015 Alexander V. Chernikov * Copyright (c) 2016-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
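in6_gre_setopts() above also shows the synchronous counterpart of the deferred free: when a writer must rewrite fields that epoch readers dereference, it unlinks the softc, waits for readers to drain, and only then mutates and relinks. Reduced to its outline (GRE_WAIT() is this driver's wrapper around the net epoch wait primitive; the GRESKEY case is used as the example):

CK_LIST_REMOVE(sc, chain);	/* new lookups can no longer find sc */
CK_LIST_REMOVE(sc, srchash);
GRE_WAIT();			/* readers that found sc have exited */
sc->gre_key = value;		/* now safe to rewrite shared fields */
error = in6_gre_attach(sc);	/* rebuild the header and relink */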
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nat64lsn.h" MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN"); #define NAT64LSN_EPOCH_ENTER(et) NET_EPOCH_ENTER(et) #define NAT64LSN_EPOCH_EXIT(et) NET_EPOCH_EXIT(et) #define NAT64LSN_EPOCH_ASSERT() NET_EPOCH_ASSERT() -#define NAT64LSN_EPOCH_CALL(c, f) epoch_call(net_epoch_preempt, (c), (f)) +#define NAT64LSN_EPOCH_CALL(c, f) NET_EPOCH_CALL((f), (c)) static uma_zone_t nat64lsn_host_zone; static uma_zone_t nat64lsn_pgchunk_zone; static uma_zone_t nat64lsn_pg_zone; static uma_zone_t nat64lsn_aliaslink_zone; static uma_zone_t nat64lsn_state_zone; static uma_zone_t nat64lsn_job_zone; static void nat64lsn_periodic(void *data); #define PERIODIC_DELAY 4 #define NAT64_LOOKUP(chain, cmd) \ (struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1) /* * Delayed job queue, used to create new hosts * and new portgroups */ enum nat64lsn_jtype { JTYPE_NEWHOST = 1, JTYPE_NEWPORTGROUP, JTYPE_DESTROY, }; struct nat64lsn_job_item { STAILQ_ENTRY(nat64lsn_job_item) entries; enum nat64lsn_jtype jtype; union { struct { /* used by JTYPE_NEWHOST, JTYPE_NEWPORTGROUP */ struct mbuf *m; struct nat64lsn_host *host; struct nat64lsn_state *state; uint32_t src6_hval; uint32_t state_hval; struct ipfw_flow_id f_id; in_addr_t faddr; uint16_t port; uint8_t proto; uint8_t done; }; struct { /* used by JTYPE_DESTROY */ struct nat64lsn_hosts_slist hosts; struct nat64lsn_pg_slist portgroups; struct nat64lsn_pgchunk *pgchunk; struct epoch_context epoch_ctx; }; }; }; static struct mtx jmtx; #define JQUEUE_LOCK_INIT() mtx_init(&jmtx, "qlock", NULL, MTX_DEF) #define JQUEUE_LOCK_DESTROY() mtx_destroy(&jmtx) #define JQUEUE_LOCK() mtx_lock(&jmtx) #define JQUEUE_UNLOCK() mtx_unlock(&jmtx) static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); static struct nat64lsn_job_item *nat64lsn_create_job( struct nat64lsn_cfg *cfg, int jtype); static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); static void nat64lsn_job_destroy(epoch_context_t ctx); static void nat64lsn_destroy_host(struct nat64lsn_host *host); static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg); static int nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, struct mbuf **mp); static int nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id, struct mbuf **mp); static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags); #define NAT64_BIT_TCP_FIN 0 /* FIN was seen */ #define NAT64_BIT_TCP_SYN 1 /* First syn in->out */ #define NAT64_BIT_TCP_ESTAB 2 /* Packet with Ack */ #define NAT64_BIT_READY_IPV4 6 /* state is ready for translate4 */ #define NAT64_BIT_STALE 7 /* state is going to be expired */ #define NAT64_FLAG_FIN (1 << NAT64_BIT_TCP_FIN) #define NAT64_FLAG_SYN (1 << NAT64_BIT_TCP_SYN) #define NAT64_FLAG_ESTAB (1 << NAT64_BIT_TCP_ESTAB) #define NAT64_FLAGS_TCP (NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN) #define NAT64_FLAG_READY (1 << NAT64_BIT_READY_IPV4) #define NAT64_FLAG_STALE (1 << NAT64_BIT_STALE) static inline uint8_t convert_tcp_flags(uint8_t flags) { 
uint8_t result; result = flags & (TH_FIN|TH_SYN); result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */ result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */ return (result); } static void nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family, struct nat64lsn_state *state) { memset(plog, 0, sizeof(*plog)); plog->length = PFLOG_REAL_HDRLEN; plog->af = family; plog->action = PF_NAT; plog->dir = PF_IN; plog->rulenr = htonl(state->ip_src); plog->subrulenr = htonl((uint32_t)(state->aport << 16) | (state->proto << 8) | (state->ip_dst & 0xff)); plog->ruleset[0] = '\0'; strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname)); ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m); } #define HVAL(p, n, s) jenkins_hash32((const uint32_t *)(p), (n), (s)) #define HOST_HVAL(c, a) HVAL((a),\ sizeof(struct in6_addr) / sizeof(uint32_t), (c)->hash_seed) #define HOSTS(c, v) ((c)->hosts_hash[(v) & ((c)->hosts_hashsize - 1)]) #define ALIASLINK_HVAL(c, f) HVAL(&(f)->dst_ip6,\ sizeof(struct in6_addr) * 2 / sizeof(uint32_t), (c)->hash_seed) #define ALIAS_BYHASH(c, v) \ ((c)->aliases[(v) & ((1 << (32 - (c)->plen4)) - 1)]) static struct nat64lsn_aliaslink* nat64lsn_get_aliaslink(struct nat64lsn_cfg *cfg __unused, struct nat64lsn_host *host, const struct ipfw_flow_id *f_id __unused) { /* * We could implement different algorithms for selecting * an alias address. * XXX: for now we use the first available. */ return (CK_SLIST_FIRST(&host->aliases)); } #define STATE_HVAL(c, d) HVAL((d), 2, (c)->hash_seed) #define STATE_HASH(h, v) \ ((h)->states_hash[(v) & ((h)->states_hashsize - 1)]) #define STATES_CHUNK(p, v) \ ((p)->chunks_count == 1 ? (p)->states : \ ((p)->states_chunk[CHUNK_BY_FADDR(p, v)])) #ifdef __LP64__ #define FREEMASK_FFSLL(pg, faddr) \ ffsll(*FREEMASK_CHUNK((pg), (faddr))) #define FREEMASK_BTR(pg, faddr, bit) \ ck_pr_btr_64(FREEMASK_CHUNK((pg), (faddr)), (bit)) #define FREEMASK_BTS(pg, faddr, bit) \ ck_pr_bts_64(FREEMASK_CHUNK((pg), (faddr)), (bit)) #define FREEMASK_ISSET(pg, faddr, bit) \ ISSET64(*FREEMASK_CHUNK((pg), (faddr)), (bit)) #define FREEMASK_COPY(pg, n, out) \ (out) = ck_pr_load_64(FREEMASK_CHUNK((pg), (n))) #else static inline int freemask_ffsll(uint32_t *freemask) { int i; if ((i = ffsl(freemask[0])) != 0) return (i); if ((i = ffsl(freemask[1])) != 0) return (i + 32); return (0); } #define FREEMASK_FFSLL(pg, faddr) \ freemask_ffsll(FREEMASK_CHUNK((pg), (faddr))) #define FREEMASK_BTR(pg, faddr, bit) \ ck_pr_btr_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32) #define FREEMASK_BTS(pg, faddr, bit) \ ck_pr_bts_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32) #define FREEMASK_ISSET(pg, faddr, bit) \ ISSET32(*(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32), (bit) % 32) #define FREEMASK_COPY(pg, n, out) \ (out) = ck_pr_load_32(FREEMASK_CHUNK((pg), (n))) | \ ((uint64_t)ck_pr_load_32(FREEMASK_CHUNK((pg), (n)) + 1) << 32) #endif /* !__LP64__ */ #define NAT64LSN_TRY_PGCNT 32 static struct nat64lsn_pg* nat64lsn_get_pg(uint32_t *chunkmask, uint32_t *pgmask, struct nat64lsn_pgchunk **chunks, struct nat64lsn_pg **pgptr, uint32_t *pgidx, in_addr_t faddr) { struct nat64lsn_pg *pg, *oldpg; uint32_t idx, oldidx; int cnt; cnt = 0; /* First try last used PG */ oldpg = pg = ck_pr_load_ptr(pgptr); idx = oldidx = ck_pr_load_32(pgidx); /* If pgidx is out of range, reset it to the first pgchunk */ if (!ISSET32(*chunkmask, idx / 32)) idx = 0; do { ck_pr_fence_load(); if (pg != NULL && FREEMASK_BITCOUNT(pg, faddr) > 0) { /* * If the last used PG had no free states, * try to update the pointer.
* NOTE: it can be already updated by jobs handler, * thus we use CAS operation. */ if (cnt > 0) ck_pr_cas_ptr(pgptr, oldpg, pg); return (pg); } /* Stop if idx is out of range */ if (!ISSET32(*chunkmask, idx / 32)) break; if (ISSET32(pgmask[idx / 32], idx % 32)) pg = ck_pr_load_ptr( &chunks[idx / 32]->pgptr[idx % 32]); else pg = NULL; idx++; } while (++cnt < NAT64LSN_TRY_PGCNT); /* If pgidx is out of range, reset it to the first pgchunk */ if (!ISSET32(*chunkmask, idx / 32)) idx = 0; ck_pr_cas_32(pgidx, oldidx, idx); return (NULL); } static struct nat64lsn_state* nat64lsn_get_state6to4(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host, const struct ipfw_flow_id *f_id, uint32_t hval, in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_aliaslink *link; struct nat64lsn_state *state; struct nat64lsn_pg *pg; int i, offset; NAT64LSN_EPOCH_ASSERT(); /* Check that we already have state for given arguments */ CK_SLIST_FOREACH(state, &STATE_HASH(host, hval), entries) { if (state->proto == proto && state->ip_dst == faddr && state->sport == port && state->dport == f_id->dst_port) return (state); } link = nat64lsn_get_aliaslink(cfg, host, f_id); if (link == NULL) return (NULL); switch (proto) { case IPPROTO_TCP: pg = nat64lsn_get_pg( &link->alias->tcp_chunkmask, link->alias->tcp_pgmask, link->alias->tcp, &link->alias->tcp_pg, &link->alias->tcp_pgidx, faddr); break; case IPPROTO_UDP: pg = nat64lsn_get_pg( &link->alias->udp_chunkmask, link->alias->udp_pgmask, link->alias->udp, &link->alias->udp_pg, &link->alias->udp_pgidx, faddr); break; case IPPROTO_ICMP: pg = nat64lsn_get_pg( &link->alias->icmp_chunkmask, link->alias->icmp_pgmask, link->alias->icmp, &link->alias->icmp_pg, &link->alias->icmp_pgidx, faddr); break; default: panic("%s: wrong proto %d", __func__, proto); } if (pg == NULL) return (NULL); /* Check that PG has some free states */ state = NULL; i = FREEMASK_BITCOUNT(pg, faddr); while (i-- > 0) { offset = FREEMASK_FFSLL(pg, faddr); if (offset == 0) { /* * We lost the race. * No more free states in this PG. */ break; } /* Lets try to atomically grab the state */ if (FREEMASK_BTR(pg, faddr, offset - 1)) { state = &STATES_CHUNK(pg, faddr)->state[offset - 1]; /* Initialize */ state->flags = proto != IPPROTO_TCP ? 0 : convert_tcp_flags(f_id->_flags); state->proto = proto; state->aport = pg->base_port + offset - 1; state->dport = f_id->dst_port; state->sport = port; state->ip6_dst = f_id->dst_ip6; state->ip_dst = faddr; state->ip_src = link->alias->addr; state->hval = hval; state->host = host; SET_AGE(state->timestamp); /* Insert new state into host's hash table */ HOST_LOCK(host); CK_SLIST_INSERT_HEAD(&STATE_HASH(host, hval), state, entries); host->states_count++; /* * XXX: In case if host is going to be expired, * reset NAT64LSN_DEADHOST flag. */ host->flags &= ~NAT64LSN_DEADHOST; HOST_UNLOCK(host); NAT64STAT_INC(&cfg->base.stats, screated); /* Mark the state as ready for translate4 */ ck_pr_fence_store(); ck_pr_bts_32(&state->flags, NAT64_BIT_READY_IPV4); break; } } return (state); } /* * Inspects icmp packets to see if the message contains different * packet header so we need to alter @addr and @port. 
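The claim loop in nat64lsn_get_state6to4() above takes a state slot without a lock: FREEMASK_FFSLL() finds a candidate free bit, and FREEMASK_BTR() atomically clears it, so exactly one thread wins each slot. The same idea in isolation, for the 64-bit freemask case (a sketch; claim_slot is not part of this source):

static int
claim_slot(uint64_t *mask)
{
	int offset;

	/* A set bit means "slot is free". */
	while ((offset = ffsll(ck_pr_load_64(mask))) != 0) {
		/* btr returns true only for the thread that flips the bit. */
		if (ck_pr_btr_64(mask, offset - 1))
			return (offset - 1);
		/* Lost the race; reload the mask and retry. */
	}
	return (-1);	/* portgroup exhausted */
}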
*/ static int inspect_icmp_mbuf(struct mbuf **mp, uint8_t *proto, uint32_t *addr, uint16_t *port) { struct icmp *icmp; struct ip *ip; int off; uint8_t inner_proto; ip = mtod(*mp, struct ip *); /* Outer IP header */ off = (ip->ip_hl << 2) + ICMP_MINLEN; if ((*mp)->m_len < off) *mp = m_pullup(*mp, off); if (*mp == NULL) return (ENOMEM); ip = mtod(*mp, struct ip *); /* Outer IP header */ icmp = L3HDR(ip, struct icmp *); switch (icmp->icmp_type) { case ICMP_ECHO: case ICMP_ECHOREPLY: /* Use icmp ID as distinguisher */ *port = ntohs(icmp->icmp_id); return (0); case ICMP_UNREACH: case ICMP_TIMXCEED: break; default: return (EOPNOTSUPP); } /* * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits * of ULP header. */ if ((*mp)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN) return (EINVAL); if ((*mp)->m_len < off + sizeof(struct ip) + ICMP_MINLEN) *mp = m_pullup(*mp, off + sizeof(struct ip) + ICMP_MINLEN); if (*mp == NULL) return (ENOMEM); ip = mtodo(*mp, off); /* Inner IP header */ inner_proto = ip->ip_p; off += ip->ip_hl << 2; /* Skip inner IP header */ *addr = ntohl(ip->ip_src.s_addr); if ((*mp)->m_len < off + ICMP_MINLEN) *mp = m_pullup(*mp, off + ICMP_MINLEN); if (*mp == NULL) return (ENOMEM); switch (inner_proto) { case IPPROTO_TCP: case IPPROTO_UDP: /* Copy source port from the header */ *port = ntohs(*((uint16_t *)mtodo(*mp, off))); *proto = inner_proto; return (0); case IPPROTO_ICMP: /* * We will translate only ICMP errors for our ICMP * echo requests. */ icmp = mtodo(*mp, off); if (icmp->icmp_type != ICMP_ECHO) return (EOPNOTSUPP); *port = ntohs(icmp->icmp_id); return (0); }; return (EOPNOTSUPP); } static struct nat64lsn_state* nat64lsn_get_state4to6(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias, in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_state *state; struct nat64lsn_pg *pg; int chunk_idx, pg_idx, state_idx; NAT64LSN_EPOCH_ASSERT(); if (port < NAT64_MIN_PORT) return (NULL); /* * Alias keeps 32 pgchunks for each protocol. * Each pgchunk has 32 pointers to portgroup. * Each portgroup has 64 states for ports. */ port -= NAT64_MIN_PORT; chunk_idx = port / 2048; port -= chunk_idx * 2048; pg_idx = port / 64; state_idx = port % 64; /* * First check in proto_chunkmask that we have allocated PG chunk. * Then check in proto_pgmask that we have valid PG pointer. 
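The decomposition above is plain radix arithmetic: each pgchunk spans 32 PGs of 64 ports each, i.e. 2048 ports. Assuming NAT64_MIN_PORT is 1024 (the value nat64lsn uses), external port 3000 maps as follows:

uint32_t port = 3000 - 1024;		/* 1976: offset from NAT64_MIN_PORT */
uint32_t chunk_idx = port / 2048;	/* 0: first pgchunk */
port -= chunk_idx * 2048;		/* still 1976 */
uint32_t pg_idx = port / 64;		/* 30: PG within the chunk */
uint32_t state_idx = port % 64;		/* 56: bit in the PG's freemask */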
*/ pg = NULL; switch (proto) { case IPPROTO_TCP: if (ISSET32(alias->tcp_chunkmask, chunk_idx) && ISSET32(alias->tcp_pgmask[chunk_idx], pg_idx)) { pg = alias->tcp[chunk_idx]->pgptr[pg_idx]; break; } return (NULL); case IPPROTO_UDP: if (ISSET32(alias->udp_chunkmask, chunk_idx) && ISSET32(alias->udp_pgmask[chunk_idx], pg_idx)) { pg = alias->udp[chunk_idx]->pgptr[pg_idx]; break; } return (NULL); case IPPROTO_ICMP: if (ISSET32(alias->icmp_chunkmask, chunk_idx) && ISSET32(alias->icmp_pgmask[chunk_idx], pg_idx)) { pg = alias->icmp[chunk_idx]->pgptr[pg_idx]; break; } return (NULL); default: panic("%s: wrong proto %d", __func__, proto); } if (pg == NULL) return (NULL); if (FREEMASK_ISSET(pg, faddr, state_idx)) return (NULL); state = &STATES_CHUNK(pg, faddr)->state[state_idx]; ck_pr_fence_load(); if (ck_pr_load_32(&state->flags) & NAT64_FLAG_READY) return (state); return (NULL); } static int nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, struct mbuf **mp) { struct pfloghdr loghdr, *logdata; struct in6_addr src6; struct nat64lsn_state *state; struct nat64lsn_alias *alias; uint32_t addr, flags; uint16_t port, ts; int ret; uint8_t proto; addr = f_id->dst_ip; port = f_id->dst_port; proto = f_id->proto; if (addr < cfg->prefix4 || addr > cfg->pmask4) { NAT64STAT_INC(&cfg->base.stats, nomatch4); return (cfg->nomatch_verdict); } /* Check if protocol is supported */ switch (proto) { case IPPROTO_ICMP: ret = inspect_icmp_mbuf(mp, &proto, &addr, &port); if (ret != 0) { if (ret == ENOMEM) { NAT64STAT_INC(&cfg->base.stats, nomem); return (IP_FW_DENY); } NAT64STAT_INC(&cfg->base.stats, noproto); return (cfg->nomatch_verdict); } if (addr < cfg->prefix4 || addr > cfg->pmask4) { NAT64STAT_INC(&cfg->base.stats, nomatch4); return (cfg->nomatch_verdict); } /* FALLTHROUGH */ case IPPROTO_TCP: case IPPROTO_UDP: break; default: NAT64STAT_INC(&cfg->base.stats, noproto); return (cfg->nomatch_verdict); } alias = &ALIAS_BYHASH(cfg, addr); MPASS(addr == alias->addr); /* Check that we have state for this port */ state = nat64lsn_get_state4to6(cfg, alias, f_id->src_ip, port, proto); if (state == NULL) { NAT64STAT_INC(&cfg->base.stats, nomatch4); return (cfg->nomatch_verdict); } /* TODO: Check flags to see if we need to do some static mapping */ /* Update some state fields if needed */ SET_AGE(ts); if (f_id->proto == IPPROTO_TCP) flags = convert_tcp_flags(f_id->_flags); else flags = 0; if (state->timestamp != ts) state->timestamp = ts; if ((state->flags & flags) != flags) state->flags |= flags; port = htons(state->sport); src6 = state->ip6_dst; if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64lsn_log(logdata, *mp, AF_INET, state); } else logdata = NULL; /* * We already have src6 with an embedded address, but it is * possible that src_ip differs from state->ip_dst; this is why * we do the embedding again. */ nat64_embed_ip4(&src6, cfg->base.plat_plen, htonl(f_id->src_ip)); ret = nat64_do_handle_ip4(*mp, &src6, &state->host->addr, port, &cfg->base, logdata); if (ret == NAT64SKIP) return (cfg->nomatch_verdict); if (ret == NAT64RETURN) *mp = NULL; return (IP_FW_DENY); } /* * Check if a particular state is stale and should be deleted. * Return 1 if true, 0 otherwise. */ static int nat64lsn_check_state(struct nat64lsn_cfg *cfg, struct nat64lsn_state *state) { int age, ttl; /* State was marked as stale in previous pass.
*/ if (ISSET32(state->flags, NAT64_BIT_STALE)) return (1); /* State is not yet initialized; it is going to become READY */ if (!ISSET32(state->flags, NAT64_BIT_READY_IPV4)) return (0); age = GET_AGE(state->timestamp); switch (state->proto) { case IPPROTO_TCP: if (ISSET32(state->flags, NAT64_BIT_TCP_FIN)) ttl = cfg->st_close_ttl; else if (ISSET32(state->flags, NAT64_BIT_TCP_ESTAB)) ttl = cfg->st_estab_ttl; else if (ISSET32(state->flags, NAT64_BIT_TCP_SYN)) ttl = cfg->st_syn_ttl; else ttl = cfg->st_syn_ttl; if (age > ttl) return (1); break; case IPPROTO_UDP: if (age > cfg->st_udp_ttl) return (1); break; case IPPROTO_ICMP: if (age > cfg->st_icmp_ttl) return (1); break; } return (0); } static int nat64lsn_maintain_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_pg *pg) { struct nat64lsn_state *state; struct nat64lsn_host *host; uint64_t freemask; int c, i, update_age; update_age = 0; for (c = 0; c < pg->chunks_count; c++) { FREEMASK_COPY(pg, c, freemask); for (i = 0; i < 64; i++) { if (ISSET64(freemask, i)) continue; state = &STATES_CHUNK(pg, c)->state[i]; if (nat64lsn_check_state(cfg, state) == 0) { update_age = 1; continue; } /* * Expire state: * 1. Mark as STALE and unlink from host's hash. * 2. Set bit in freemask. */ if (ISSET32(state->flags, NAT64_BIT_STALE)) { /* * State was marked as STALE in previous * pass. Now it is safe to release it. */ state->flags = 0; ck_pr_fence_store(); FREEMASK_BTS(pg, c, i); NAT64STAT_INC(&cfg->base.stats, sdeleted); continue; } MPASS(state->flags & NAT64_FLAG_READY); host = state->host; HOST_LOCK(host); CK_SLIST_REMOVE(&STATE_HASH(host, state->hval), state, nat64lsn_state, entries); host->states_count--; HOST_UNLOCK(host); /* Reset READY flag */ ck_pr_btr_32(&state->flags, NAT64_BIT_READY_IPV4); /* And set STALE flag */ ck_pr_bts_32(&state->flags, NAT64_BIT_STALE); ck_pr_fence_store(); /* * Now translate6 will not use this state; wait * until it becomes safe for translate4, then mark * the state as free. */ } } /* * If some states are still alive, update the timestamp. */ if (update_age) SET_AGE(pg->timestamp); if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay) return (0); return (1); } static void nat64lsn_expire_portgroups(struct nat64lsn_cfg *cfg, struct nat64lsn_pg_slist *portgroups) { struct nat64lsn_alias *alias; struct nat64lsn_pg *pg, *tpg, *firstpg, **pgptr; uint32_t *pgmask, *pgidx; int i, idx; for (i = 0; i < 1 << (32 - cfg->plen4); i++) { alias = &cfg->aliases[i]; CK_SLIST_FOREACH_SAFE(pg, &alias->portgroups, entries, tpg) { if (nat64lsn_maintain_pg(cfg, pg) == 0) continue; /* Always keep first PG */ if (pg->base_port == NAT64_MIN_PORT) continue; /* * The PG is expired; unlink it and schedule it for * deferred destroying. */ idx = (pg->base_port - NAT64_MIN_PORT) / 64; switch (pg->proto) { case IPPROTO_TCP: pgmask = alias->tcp_pgmask; pgptr = &alias->tcp_pg; pgidx = &alias->tcp_pgidx; firstpg = alias->tcp[0]->pgptr[0]; break; case IPPROTO_UDP: pgmask = alias->udp_pgmask; pgptr = &alias->udp_pg; pgidx = &alias->udp_pgidx; firstpg = alias->udp[0]->pgptr[0]; break; case IPPROTO_ICMP: pgmask = alias->icmp_pgmask; pgptr = &alias->icmp_pg; pgidx = &alias->icmp_pgidx; firstpg = alias->icmp[0]->pgptr[0]; break; } /* Reset the corresponding bit in pgmask array. */ ck_pr_btr_32(&pgmask[idx / 32], idx % 32); ck_pr_fence_store(); /* If the last used PG points to this PG, reset it.
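Note that expiry is deliberately two-pass: the first pass hides a state (unlinked from the host hash, READY cleared, STALE set) while its slot stays reserved, and only the next periodic pass returns the slot to the freemask, by which time any epoch reader that found the state is long gone. Condensed from the logic above (a sketch, not a drop-in helper; chunk and slot stand for the loop indices c and i):

if (ISSET32(state->flags, NAT64_BIT_STALE)) {
	/* Pass 2: every reader from pass 1 has left its epoch section. */
	state->flags = 0;
	ck_pr_fence_store();
	FREEMASK_BTS(pg, chunk, slot);	/* slot is reusable again */
} else {
	/* Pass 1: hide the state, keep the slot reserved. */
	ck_pr_btr_32(&state->flags, NAT64_BIT_READY_IPV4);
	ck_pr_bts_32(&state->flags, NAT64_BIT_STALE);
	ck_pr_fence_store();
}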
*/ ck_pr_cas_ptr(pgptr, pg, firstpg); ck_pr_cas_32(pgidx, idx, 0); /* Unlink PG from alias's chain */ ALIAS_LOCK(alias); CK_SLIST_REMOVE(&alias->portgroups, pg, nat64lsn_pg, entries); alias->portgroups_count--; ALIAS_UNLOCK(alias); /* And link to job's chain for deferred destroying */ NAT64STAT_INC(&cfg->base.stats, spgdeleted); CK_SLIST_INSERT_HEAD(portgroups, pg, entries); } } } static void nat64lsn_expire_hosts(struct nat64lsn_cfg *cfg, struct nat64lsn_hosts_slist *hosts) { struct nat64lsn_host *host, *tmp; int i; for (i = 0; i < cfg->hosts_hashsize; i++) { CK_SLIST_FOREACH_SAFE(host, &cfg->hosts_hash[i], entries, tmp) { /* Was the host marked in a previous call? */ if (host->flags & NAT64LSN_DEADHOST) { if (host->states_count > 0) { host->flags &= ~NAT64LSN_DEADHOST; continue; } /* * Unlink host from hash table and schedule * it for deferred destroying. */ CFG_LOCK(cfg); CK_SLIST_REMOVE(&cfg->hosts_hash[i], host, nat64lsn_host, entries); cfg->hosts_count--; CFG_UNLOCK(cfg); CK_SLIST_INSERT_HEAD(hosts, host, entries); continue; } if (GET_AGE(host->timestamp) < cfg->host_delete_delay) continue; if (host->states_count > 0) continue; /* Mark the host to be expired in the next pass */ host->flags |= NAT64LSN_DEADHOST; ck_pr_fence_store(); } } } static struct nat64lsn_pgchunk* nat64lsn_expire_pgchunk(struct nat64lsn_cfg *cfg) { #if 0 struct nat64lsn_alias *alias; struct nat64lsn_pgchunk *chunk; uint32_t pgmask; int i, c; for (i = 0; i < 1 << (32 - cfg->plen4); i++) { alias = &cfg->aliases[i]; if (GET_AGE(alias->timestamp) < cfg->pgchunk_delete_delay) continue; /* Always keep single chunk allocated */ for (c = 1; c < 32; c++) { if ((alias->tcp_chunkmask & (1 << c)) == 0) break; chunk = ck_pr_load_ptr(&alias->tcp[c]); if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0) continue; ck_pr_btr_32(&alias->tcp_chunkmask, c); ck_pr_fence_load(); if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0) continue; } } #endif return (NULL); } #if 0 static void nat64lsn_maintain_hosts(struct nat64lsn_cfg *cfg) { struct nat64lsn_host *h; struct nat64lsn_states_slist *hash; int i, j, hsize; for (i = 0; i < cfg->hosts_hashsize; i++) { CK_SLIST_FOREACH(h, &cfg->hosts_hash[i], entries) { if (h->states_count / 2 < h->states_hashsize || h->states_hashsize >= NAT64LSN_MAX_HSIZE) continue; hsize = h->states_hashsize * 2; hash = malloc(sizeof(*hash) * hsize, M_NAT64LSN, M_NOWAIT); if (hash == NULL) continue; for (j = 0; j < hsize; j++) CK_SLIST_INIT(&hash[j]); ck_pr_bts_32(&h->flags, NAT64LSN_GROWHASH); } } } #endif /* * This procedure is used to perform various maintenance * on the dynamic hash lists. Currently it is called every 4 seconds. */ static void nat64lsn_periodic(void *data) { struct nat64lsn_job_item *ji; struct nat64lsn_cfg *cfg; cfg = (struct nat64lsn_cfg *) data; CURVNET_SET(cfg->vp); if (cfg->hosts_count > 0) { ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT); if (ji != NULL) { ji->jtype = JTYPE_DESTROY; CK_SLIST_INIT(&ji->hosts); CK_SLIST_INIT(&ji->portgroups); nat64lsn_expire_hosts(cfg, &ji->hosts); nat64lsn_expire_portgroups(cfg, &ji->portgroups); ji->pgchunk = nat64lsn_expire_pgchunk(cfg); NAT64LSN_EPOCH_CALL(&ji->epoch_ctx, nat64lsn_job_destroy); } else NAT64STAT_INC(&cfg->base.stats, jnomem); } callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY); CURVNET_RESTORE(); } #define ALLOC_ERROR(stage, type) ((stage) ?
10 * (type) + (stage): 0) #define HOST_ERROR(stage) ALLOC_ERROR(stage, 1) #define PG_ERROR(stage) ALLOC_ERROR(stage, 2) static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { char a[INET6_ADDRSTRLEN]; struct nat64lsn_aliaslink *link; struct nat64lsn_host *host; struct nat64lsn_state *state; uint32_t hval, data[2]; int i; /* Check that host was not yet added. */ NAT64LSN_EPOCH_ASSERT(); CK_SLIST_FOREACH(host, &HOSTS(cfg, ji->src6_hval), entries) { if (IN6_ARE_ADDR_EQUAL(&ji->f_id.src_ip6, &host->addr)) { /* The host was allocated in previous call. */ ji->host = host; goto get_state; } } host = ji->host = uma_zalloc(nat64lsn_host_zone, M_NOWAIT); if (ji->host == NULL) return (HOST_ERROR(1)); host->states_hashsize = NAT64LSN_HSIZE; host->states_hash = malloc(sizeof(struct nat64lsn_states_slist) * host->states_hashsize, M_NAT64LSN, M_NOWAIT); if (host->states_hash == NULL) { uma_zfree(nat64lsn_host_zone, host); return (HOST_ERROR(2)); } link = uma_zalloc(nat64lsn_aliaslink_zone, M_NOWAIT); if (link == NULL) { free(host->states_hash, M_NAT64LSN); uma_zfree(nat64lsn_host_zone, host); return (HOST_ERROR(3)); } /* Initialize */ HOST_LOCK_INIT(host); SET_AGE(host->timestamp); host->addr = ji->f_id.src_ip6; host->hval = ji->src6_hval; host->flags = 0; host->states_count = 0; host->states_hashsize = NAT64LSN_HSIZE; CK_SLIST_INIT(&host->aliases); for (i = 0; i < host->states_hashsize; i++) CK_SLIST_INIT(&host->states_hash[i]); /* Determine alias from flow hash. */ hval = ALIASLINK_HVAL(cfg, &ji->f_id); link->alias = &ALIAS_BYHASH(cfg, hval); CK_SLIST_INSERT_HEAD(&host->aliases, link, host_entries); ALIAS_LOCK(link->alias); CK_SLIST_INSERT_HEAD(&link->alias->hosts, link, alias_entries); link->alias->hosts_count++; ALIAS_UNLOCK(link->alias); CFG_LOCK(cfg); CK_SLIST_INSERT_HEAD(&HOSTS(cfg, ji->src6_hval), host, entries); cfg->hosts_count++; CFG_UNLOCK(cfg); get_state: data[0] = ji->faddr; data[1] = (ji->f_id.dst_port << 16) | ji->port; ji->state_hval = hval = STATE_HVAL(cfg, data); state = nat64lsn_get_state6to4(cfg, host, &ji->f_id, hval, ji->faddr, ji->port, ji->proto); /* * We failed to obtain new state, used alias needs new PG. * XXX: or another alias should be used. 
*/ if (state == NULL) { /* Try to allocate new PG */ if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0)) return (HOST_ERROR(4)); /* We assume that nat64lsn_alloc_pg() got state */ } else ji->state = state; ji->done = 1; DPRINTF(DP_OBJ, "ALLOC HOST %s %p", inet_ntop(AF_INET6, &host->addr, a, sizeof(a)), host); return (HOST_ERROR(0)); } static int nat64lsn_find_pg_place(uint32_t *data) { int i; for (i = 0; i < 32; i++) { if (~data[i] == 0) continue; return (i * 32 + ffs(~data[i]) - 1); } return (-1); } static int nat64lsn_alloc_proto_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias, uint32_t *chunkmask, uint32_t *pgmask, struct nat64lsn_pgchunk **chunks, struct nat64lsn_pg **pgptr, uint8_t proto) { struct nat64lsn_pg *pg; int i, pg_idx, chunk_idx; /* Find place in pgchunk where PG can be added */ pg_idx = nat64lsn_find_pg_place(pgmask); if (pg_idx < 0) /* no more PGs */ return (PG_ERROR(1)); /* Check that we have allocated pgchunk for given PG index */ chunk_idx = pg_idx / 32; if (!ISSET32(*chunkmask, chunk_idx)) { chunks[chunk_idx] = uma_zalloc(nat64lsn_pgchunk_zone, M_NOWAIT); if (chunks[chunk_idx] == NULL) return (PG_ERROR(2)); ck_pr_bts_32(chunkmask, chunk_idx); ck_pr_fence_store(); } /* Allocate PG and states chunks */ pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT); if (pg == NULL) return (PG_ERROR(3)); pg->chunks_count = cfg->states_chunks; if (pg->chunks_count > 1) { pg->freemask_chunk = malloc(pg->chunks_count * sizeof(uint64_t), M_NAT64LSN, M_NOWAIT); if (pg->freemask_chunk == NULL) { uma_zfree(nat64lsn_pg_zone, pg); return (PG_ERROR(4)); } pg->states_chunk = malloc(pg->chunks_count * sizeof(struct nat64lsn_states_chunk *), M_NAT64LSN, M_NOWAIT | M_ZERO); if (pg->states_chunk == NULL) { free(pg->freemask_chunk, M_NAT64LSN); uma_zfree(nat64lsn_pg_zone, pg); return (PG_ERROR(5)); } for (i = 0; i < pg->chunks_count; i++) { pg->states_chunk[i] = uma_zalloc( nat64lsn_state_zone, M_NOWAIT); if (pg->states_chunk[i] == NULL) goto states_failed; } memset(pg->freemask_chunk, 0xff, sizeof(uint64_t) * pg->chunks_count); } else { pg->states = uma_zalloc(nat64lsn_state_zone, M_NOWAIT); if (pg->states == NULL) { uma_zfree(nat64lsn_pg_zone, pg); return (PG_ERROR(6)); } memset(&pg->freemask64, 0xff, sizeof(uint64_t)); } /* Initialize PG and hook it to pgchunk */ SET_AGE(pg->timestamp); pg->proto = proto; pg->base_port = NAT64_MIN_PORT + 64 * pg_idx; ck_pr_store_ptr(&chunks[chunk_idx]->pgptr[pg_idx % 32], pg); ck_pr_fence_store(); ck_pr_bts_32(&pgmask[pg_idx / 32], pg_idx % 32); ck_pr_store_ptr(pgptr, pg); ALIAS_LOCK(alias); CK_SLIST_INSERT_HEAD(&alias->portgroups, pg, entries); SET_AGE(alias->timestamp); alias->portgroups_count++; ALIAS_UNLOCK(alias); NAT64STAT_INC(&cfg->base.stats, spgcreated); return (PG_ERROR(0)); states_failed: for (i = 0; i < pg->chunks_count; i++) uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]); free(pg->freemask_chunk, M_NAT64LSN); free(pg->states_chunk, M_NAT64LSN); uma_zfree(nat64lsn_pg_zone, pg); return (PG_ERROR(7)); } static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { struct nat64lsn_aliaslink *link; struct nat64lsn_alias *alias; int ret; link = nat64lsn_get_aliaslink(cfg, ji->host, &ji->f_id); if (link == NULL) return (PG_ERROR(1)); /* * TODO: check that we did not already allocated PG in * previous call. 
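The HOST_ERROR()/PG_ERROR() values used above encode both the failing allocation stage and the object type in a single integer: ALLOC_ERROR(stage, type) evaluates to 10 * type + stage, with 0 reserved for success. For example:

HOST_ERROR(0);	/* == 0: success (stage 0)            */
HOST_ERROR(2);	/* == 12: host path failed at stage 2 */
PG_ERROR(3);	/* == 23: PG path failed at stage 3   */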
*/ ret = 0; alias = link->alias; /* Find place in pgchunk where PG can be added */ switch (ji->proto) { case IPPROTO_TCP: ret = nat64lsn_alloc_proto_pg(cfg, alias, &alias->tcp_chunkmask, alias->tcp_pgmask, alias->tcp, &alias->tcp_pg, ji->proto); break; case IPPROTO_UDP: ret = nat64lsn_alloc_proto_pg(cfg, alias, &alias->udp_chunkmask, alias->udp_pgmask, alias->udp, &alias->udp_pg, ji->proto); break; case IPPROTO_ICMP: ret = nat64lsn_alloc_proto_pg(cfg, alias, &alias->icmp_chunkmask, alias->icmp_pgmask, alias->icmp, &alias->icmp_pg, ji->proto); break; default: panic("%s: wrong proto %d", __func__, ji->proto); } if (ret == PG_ERROR(1)) { /* * PG_ERROR(1) means that the alias lacks free PGs. * XXX: try the next alias. */ printf("NAT64LSN: %s: failed to obtain PG\n", __func__); return (ret); } if (ret == PG_ERROR(0)) { ji->state = nat64lsn_get_state6to4(cfg, ji->host, &ji->f_id, ji->state_hval, ji->faddr, ji->port, ji->proto); if (ji->state == NULL) ret = PG_ERROR(8); else ji->done = 1; } return (ret); } static void nat64lsn_do_request(void *data) { struct epoch_tracker et; struct nat64lsn_job_head jhead; struct nat64lsn_job_item *ji, *ji2; struct nat64lsn_cfg *cfg; int jcount; uint8_t flags; cfg = (struct nat64lsn_cfg *)data; if (cfg->jlen == 0) return; CURVNET_SET(cfg->vp); STAILQ_INIT(&jhead); /* Grab queue */ JQUEUE_LOCK(); STAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item); jcount = cfg->jlen; cfg->jlen = 0; JQUEUE_UNLOCK(); /* TODO: check if we need to resize hash */ NAT64STAT_INC(&cfg->base.stats, jcalls); DPRINTF(DP_JQUEUE, "count=%d", jcount); /* * TODO: * What we should do here is build a hash * to ensure we don't have lots of duplicate requests. * Skip this for now. * * TODO: Limit per-call number of items */ NAT64LSN_EPOCH_ENTER(et); STAILQ_FOREACH(ji, &jhead, entries) { switch (ji->jtype) { case JTYPE_NEWHOST: if (nat64lsn_alloc_host(cfg, ji) != HOST_ERROR(0)) NAT64STAT_INC(&cfg->base.stats, jhostfails); break; case JTYPE_NEWPORTGROUP: if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0)) NAT64STAT_INC(&cfg->base.stats, jportfails); break; default: continue; } if (ji->done != 0) { flags = ji->proto != IPPROTO_TCP ? 0 : convert_tcp_flags(ji->f_id._flags); nat64lsn_translate6_internal(cfg, &ji->m, ji->state, flags); NAT64STAT_INC(&cfg->base.stats, jreinjected); } } NAT64LSN_EPOCH_EXIT(et); ji = STAILQ_FIRST(&jhead); while (ji != NULL) { ji2 = STAILQ_NEXT(ji, entries); /* * In any case we must free the mbuf if the * translator did not consume it. */ m_freem(ji->m); uma_zfree(nat64lsn_job_zone, ji); ji = ji2; } CURVNET_RESTORE(); } static struct nat64lsn_job_item * nat64lsn_create_job(struct nat64lsn_cfg *cfg, int jtype) { struct nat64lsn_job_item *ji; /* * Do not try to lock a possibly contested mutex if we're near * the limit. Drop the packet instead. */
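nat64lsn_do_request() above drains the job queue with STAILQ_SWAP, so the mutex is held only for the swap itself, never across the allocations. That batch-drain idiom in isolation:

struct nat64lsn_job_head jhead = STAILQ_HEAD_INITIALIZER(jhead);

JQUEUE_LOCK();
STAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item);	/* steal all jobs */
cfg->jlen = 0;
JQUEUE_UNLOCK();
/* jhead is now private; process it without holding the mutex. */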
*/ ji = NULL; if (cfg->jlen >= cfg->jmaxlen) NAT64STAT_INC(&cfg->base.stats, jmaxlen); else { ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT); if (ji == NULL) NAT64STAT_INC(&cfg->base.stats, jnomem); } if (ji == NULL) { NAT64STAT_INC(&cfg->base.stats, dropped); DPRINTF(DP_DROPS, "failed to create job"); } else { ji->jtype = jtype; ji->done = 0; } return (ji); } static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { JQUEUE_LOCK(); STAILQ_INSERT_TAIL(&cfg->jhead, ji, entries); NAT64STAT_INC(&cfg->base.stats, jrequests); cfg->jlen++; if (callout_pending(&cfg->jcallout) == 0) callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg); JQUEUE_UNLOCK(); } static void nat64lsn_job_destroy(epoch_context_t ctx) { struct nat64lsn_job_item *ji; struct nat64lsn_host *host; struct nat64lsn_pg *pg; int i; ji = __containerof(ctx, struct nat64lsn_job_item, epoch_ctx); MPASS(ji->jtype == JTYPE_DESTROY); while (!CK_SLIST_EMPTY(&ji->hosts)) { host = CK_SLIST_FIRST(&ji->hosts); CK_SLIST_REMOVE_HEAD(&ji->hosts, entries); if (host->states_count > 0) { /* * XXX: The state has been created * during host deletion. */ printf("NAT64LSN: %s: destroying host with %d " "states\n", __func__, host->states_count); } nat64lsn_destroy_host(host); } while (!CK_SLIST_EMPTY(&ji->portgroups)) { pg = CK_SLIST_FIRST(&ji->portgroups); CK_SLIST_REMOVE_HEAD(&ji->portgroups, entries); for (i = 0; i < pg->chunks_count; i++) { if (FREEMASK_BITCOUNT(pg, i) != 64) { /* * XXX: The state has been created during * PG deletion. */ printf("NAT64LSN: %s: destroying PG %p " "with non-empty chunk %d\n", __func__, pg, i); } } nat64lsn_destroy_pg(pg); } uma_zfree(nat64lsn_pgchunk_zone, ji->pgchunk); uma_zfree(nat64lsn_job_zone, ji); } static int nat64lsn_request_host(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval, in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_job_item *ji; ji = nat64lsn_create_job(cfg, JTYPE_NEWHOST); if (ji != NULL) { ji->m = *mp; ji->f_id = *f_id; ji->faddr = faddr; ji->port = port; ji->proto = proto; ji->src6_hval = hval; nat64lsn_enqueue_job(cfg, ji); NAT64STAT_INC(&cfg->base.stats, jhostsreq); *mp = NULL; } return (IP_FW_DENY); } static int nat64lsn_request_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host, const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval, in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_job_item *ji; ji = nat64lsn_create_job(cfg, JTYPE_NEWPORTGROUP); if (ji != NULL) { ji->m = *mp; ji->f_id = *f_id; ji->faddr = faddr; ji->port = port; ji->proto = proto; ji->state_hval = hval; ji->host = host; nat64lsn_enqueue_job(cfg, ji); NAT64STAT_INC(&cfg->base.stats, jportreq); *mp = NULL; } return (IP_FW_DENY); } static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags) { struct pfloghdr loghdr, *logdata; int ret; uint16_t ts; /* Update timestamp and flags if needed */ SET_AGE(ts); if (state->timestamp != ts) state->timestamp = ts; if ((state->flags & flags) != 0) state->flags |= flags; if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64lsn_log(logdata, *mp, AF_INET6, state); } else logdata = NULL; ret = nat64_do_handle_ip6(*mp, htonl(state->ip_src), htons(state->aport), &cfg->base, logdata); if (ret == NAT64SKIP) return (cfg->nomatch_verdict); if (ret == NAT64RETURN) *mp = NULL; return (IP_FW_DENY); } static int nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id, struct mbuf **mp) { struct 
nat64lsn_state *state; struct nat64lsn_host *host; struct icmp6_hdr *icmp6; uint32_t addr, hval, data[2]; int offset, proto; uint16_t port; uint8_t flags; /* Check if protocol is supported */ port = f_id->src_port; proto = f_id->proto; switch (f_id->proto) { case IPPROTO_ICMPV6: /* * For ICMPv6 echo reply/request we use icmp6_id as * the local port. */ offset = 0; proto = nat64_getlasthdr(*mp, &offset); if (proto < 0) { NAT64STAT_INC(&cfg->base.stats, dropped); DPRINTF(DP_DROPS, "mbuf isn't contiguous"); return (IP_FW_DENY); } if (proto == IPPROTO_ICMPV6) { icmp6 = mtodo(*mp, offset); if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST || icmp6->icmp6_type == ICMP6_ECHO_REPLY) port = ntohs(icmp6->icmp6_id); } proto = IPPROTO_ICMP; /* FALLTHROUGH */ case IPPROTO_TCP: case IPPROTO_UDP: break; default: NAT64STAT_INC(&cfg->base.stats, noproto); return (cfg->nomatch_verdict); } /* Extract IPv4 from destination IPv6 address */ addr = nat64_extract_ip4(&f_id->dst_ip6, cfg->base.plat_plen); if (addr == 0 || nat64_check_private_ip4(&cfg->base, addr) != 0) { char a[INET_ADDRSTRLEN]; NAT64STAT_INC(&cfg->base.stats, dropped); DPRINTF(DP_DROPS, "dropped due to embedded IPv4 address %s", inet_ntop(AF_INET, &addr, a, sizeof(a))); return (IP_FW_DENY); /* XXX: add extra stats? */ } /* Try to find host */ hval = HOST_HVAL(cfg, &f_id->src_ip6); CK_SLIST_FOREACH(host, &HOSTS(cfg, hval), entries) { if (IN6_ARE_ADDR_EQUAL(&f_id->src_ip6, &host->addr)) break; } /* We use IPv4 address in host byte order */ addr = ntohl(addr); if (host == NULL) return (nat64lsn_request_host(cfg, f_id, mp, hval, addr, port, proto)); flags = proto != IPPROTO_TCP ? 0 : convert_tcp_flags(f_id->_flags); data[0] = addr; data[1] = (f_id->dst_port << 16) | port; hval = STATE_HVAL(cfg, data); state = nat64lsn_get_state6to4(cfg, host, f_id, hval, addr, port, proto); if (state == NULL) return (nat64lsn_request_pg(cfg, host, f_id, mp, hval, addr, port, proto)); return (nat64lsn_translate6_internal(cfg, mp, state, flags)); } /* * Main dataplane entry point. */ int ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { struct nat64lsn_cfg *cfg; ipfw_insn *icmd; int ret; IPFW_RLOCK_ASSERT(ch); *done = 0; /* continue the search in case of failure */ icmd = cmd + 1; if (cmd->opcode != O_EXTERNAL_ACTION || cmd->arg1 != V_nat64lsn_eid || icmd->opcode != O_EXTERNAL_INSTANCE || (cfg = NAT64_LOOKUP(ch, icmd)) == NULL) return (IP_FW_DENY); *done = 1; /* terminate the search */ switch (args->f_id.addr_type) { case 4: ret = nat64lsn_translate4(cfg, &args->f_id, &args->m); break; case 6: /* * Check that destination IPv6 address matches our prefix6.
*/ if ((cfg->base.flags & NAT64LSN_ANYPREFIX) == 0 && memcmp(&args->f_id.dst_ip6, &cfg->base.plat_prefix, cfg->base.plat_plen / 8) != 0) { ret = cfg->nomatch_verdict; break; } ret = nat64lsn_translate6(cfg, &args->f_id, &args->m); break; default: ret = cfg->nomatch_verdict; } if (ret != IP_FW_PASS && args->m != NULL) { m_freem(args->m); args->m = NULL; } return (ret); } static int nat64lsn_state_ctor(void *mem, int size, void *arg, int flags) { struct nat64lsn_states_chunk *chunk; int i; chunk = (struct nat64lsn_states_chunk *)mem; for (i = 0; i < 64; i++) chunk->state[i].flags = 0; return (0); } void nat64lsn_init_internal(void) { nat64lsn_host_zone = uma_zcreate("NAT64LSN hosts", sizeof(struct nat64lsn_host), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_pgchunk_zone = uma_zcreate("NAT64LSN portgroup chunks", sizeof(struct nat64lsn_pgchunk), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_pg_zone = uma_zcreate("NAT64LSN portgroups", sizeof(struct nat64lsn_pg), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_aliaslink_zone = uma_zcreate("NAT64LSN links", sizeof(struct nat64lsn_aliaslink), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_state_zone = uma_zcreate("NAT64LSN states", sizeof(struct nat64lsn_states_chunk), nat64lsn_state_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_job_zone = uma_zcreate("NAT64LSN jobs", sizeof(struct nat64lsn_job_item), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); JQUEUE_LOCK_INIT(); } void nat64lsn_uninit_internal(void) { /* XXX: epoch_task drain */ JQUEUE_LOCK_DESTROY(); uma_zdestroy(nat64lsn_host_zone); uma_zdestroy(nat64lsn_pgchunk_zone); uma_zdestroy(nat64lsn_pg_zone); uma_zdestroy(nat64lsn_aliaslink_zone); uma_zdestroy(nat64lsn_state_zone); uma_zdestroy(nat64lsn_job_zone); } void nat64lsn_start_instance(struct nat64lsn_cfg *cfg) { CALLOUT_LOCK(cfg); callout_reset(&cfg->periodic, hz * PERIODIC_DELAY, nat64lsn_periodic, cfg); CALLOUT_UNLOCK(cfg); } struct nat64lsn_cfg * nat64lsn_init_instance(struct ip_fw_chain *ch, in_addr_t prefix, int plen) { struct nat64lsn_cfg *cfg; struct nat64lsn_alias *alias; int i, naddr; cfg = malloc(sizeof(struct nat64lsn_cfg), M_NAT64LSN, M_WAITOK | M_ZERO); CFG_LOCK_INIT(cfg); CALLOUT_LOCK_INIT(cfg); STAILQ_INIT(&cfg->jhead); cfg->vp = curvnet; COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK); cfg->hash_seed = arc4random(); cfg->hosts_hashsize = NAT64LSN_HOSTS_HSIZE; cfg->hosts_hash = malloc(sizeof(struct nat64lsn_hosts_slist) * cfg->hosts_hashsize, M_NAT64LSN, M_WAITOK | M_ZERO); for (i = 0; i < cfg->hosts_hashsize; i++) CK_SLIST_INIT(&cfg->hosts_hash[i]); naddr = 1 << (32 - plen); cfg->prefix4 = prefix; cfg->pmask4 = prefix | (naddr - 1); cfg->plen4 = plen; cfg->aliases = malloc(sizeof(struct nat64lsn_alias) * naddr, M_NAT64LSN, M_WAITOK | M_ZERO); for (i = 0; i < naddr; i++) { alias = &cfg->aliases[i]; alias->addr = prefix + i; /* host byte order */ CK_SLIST_INIT(&alias->hosts); ALIAS_LOCK_INIT(alias); } callout_init_mtx(&cfg->periodic, &cfg->periodic_lock, 0); callout_init(&cfg->jcallout, CALLOUT_MPSAFE); return (cfg); } static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg) { int i; if (pg->chunks_count == 1) { uma_zfree(nat64lsn_state_zone, pg->states); } else { for (i = 0; i < pg->chunks_count; i++) uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]); free(pg->states_chunk, M_NAT64LSN); free(pg->freemask_chunk, M_NAT64LSN); } uma_zfree(nat64lsn_pg_zone, pg); } static void nat64lsn_destroy_alias(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias) { struct nat64lsn_pg *pg; int i; 
while (!CK_SLIST_EMPTY(&alias->portgroups)) { pg = CK_SLIST_FIRST(&alias->portgroups); CK_SLIST_REMOVE_HEAD(&alias->portgroups, entries); nat64lsn_destroy_pg(pg); } for (i = 0; i < 32; i++) { if (ISSET32(alias->tcp_chunkmask, i)) uma_zfree(nat64lsn_pgchunk_zone, alias->tcp[i]); if (ISSET32(alias->udp_chunkmask, i)) uma_zfree(nat64lsn_pgchunk_zone, alias->udp[i]); if (ISSET32(alias->icmp_chunkmask, i)) uma_zfree(nat64lsn_pgchunk_zone, alias->icmp[i]); } ALIAS_LOCK_DESTROY(alias); } static void nat64lsn_destroy_host(struct nat64lsn_host *host) { struct nat64lsn_aliaslink *link; while (!CK_SLIST_EMPTY(&host->aliases)) { link = CK_SLIST_FIRST(&host->aliases); CK_SLIST_REMOVE_HEAD(&host->aliases, host_entries); ALIAS_LOCK(link->alias); CK_SLIST_REMOVE(&link->alias->hosts, link, nat64lsn_aliaslink, alias_entries); link->alias->hosts_count--; ALIAS_UNLOCK(link->alias); uma_zfree(nat64lsn_aliaslink_zone, link); } HOST_LOCK_DESTROY(host); free(host->states_hash, M_NAT64LSN); uma_zfree(nat64lsn_host_zone, host); } void nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg) { struct nat64lsn_host *host; int i; CALLOUT_LOCK(cfg); callout_drain(&cfg->periodic); CALLOUT_UNLOCK(cfg); callout_drain(&cfg->jcallout); for (i = 0; i < cfg->hosts_hashsize; i++) { while (!CK_SLIST_EMPTY(&cfg->hosts_hash[i])) { host = CK_SLIST_FIRST(&cfg->hosts_hash[i]); CK_SLIST_REMOVE_HEAD(&cfg->hosts_hash[i], entries); nat64lsn_destroy_host(host); } } for (i = 0; i < (1 << (32 - cfg->plen4)); i++) nat64lsn_destroy_alias(cfg, &cfg->aliases[i]); CALLOUT_LOCK_DESTROY(cfg); CFG_LOCK_DESTROY(cfg); COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS); free(cfg->hosts_hash, M_NAT64LSN); free(cfg->aliases, M_NAT64LSN); free(cfg, M_NAT64LSN); } Index: head/sys/sys/epoch.h =================================================================== --- head/sys/sys/epoch.h (revision 356754) +++ head/sys/sys/epoch.h (revision 356755) @@ -1,107 +1,108 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018, Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _SYS_EPOCH_H_ #define _SYS_EPOCH_H_ struct epoch_context { void *data[2]; } __aligned(sizeof(void *)); typedef struct epoch_context *epoch_context_t; #ifdef _KERNEL #include #include #include struct epoch; typedef struct epoch *epoch_t; #define EPOCH_PREEMPT 0x1 #define EPOCH_LOCKED 0x2 extern epoch_t global_epoch; extern epoch_t global_epoch_preempt; struct epoch_tracker { TAILQ_ENTRY(epoch_tracker) et_link; struct thread *et_td; ck_epoch_section_t et_section; #ifdef EPOCH_TRACE struct epoch *et_epoch; SLIST_ENTRY(epoch_tracker) et_tlink; const char *et_file; int et_line; #endif } __aligned(sizeof(void *)); typedef struct epoch_tracker *epoch_tracker_t; epoch_t epoch_alloc(const char *name, int flags); void epoch_free(epoch_t epoch); void epoch_wait(epoch_t epoch); void epoch_wait_preempt(epoch_t epoch); void epoch_drain_callbacks(epoch_t epoch); void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t)); int in_epoch(epoch_t epoch); int in_epoch_verbose(epoch_t epoch, int dump_onfail); DPCPU_DECLARE(int, epoch_cb_count); DPCPU_DECLARE(struct grouptask, epoch_cb_task); #ifdef EPOCH_TRACE #define EPOCH_FILE_LINE , const char *file, int line #else #define EPOCH_FILE_LINE #endif void _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE); void _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE); #ifdef EPOCH_TRACE void epoch_trace_list(struct thread *); #define epoch_enter_preempt(epoch, et) _epoch_enter_preempt(epoch, et, __FILE__, __LINE__) #define epoch_exit_preempt(epoch, et) _epoch_exit_preempt(epoch, et, __FILE__, __LINE__) #else #define epoch_enter_preempt(epoch, et) _epoch_enter_preempt(epoch, et) #define epoch_exit_preempt(epoch, et) _epoch_exit_preempt(epoch, et) #endif void epoch_enter(epoch_t epoch); void epoch_exit(epoch_t epoch); /* * Globally recognized epochs in the FreeBSD kernel. */ /* Network preemptible epoch, declared in sys/net/if.c. */ extern epoch_t net_epoch_preempt; #define NET_EPOCH_ENTER(et) epoch_enter_preempt(net_epoch_preempt, &(et)) #define NET_EPOCH_EXIT(et) epoch_exit_preempt(net_epoch_preempt, &(et)) #define NET_EPOCH_WAIT() epoch_wait_preempt(net_epoch_preempt) +#define NET_EPOCH_CALL(f, c) epoch_call(net_epoch_preempt, (c), (f)) #define NET_EPOCH_ASSERT() MPASS(in_epoch(net_epoch_preempt)) #endif /* _KERNEL */ #endif /* _SYS_EPOCH_H_ */
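Note the argument order of the new macro: NET_EPOCH_CALL(f, c) takes the callback first, while the underlying epoch_call() takes the context before the callback. Conversions therefore swap the last two arguments, e.g. (obj_free and obj are illustrative names, not from this source):

/* Before this change: */
epoch_call(net_epoch_preempt, &obj->epoch_ctx, obj_free);
/* After (equivalent): */
NET_EPOCH_CALL(obj_free, &obj->epoch_ctx);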